xref: /dragonfly/contrib/gmp/mpn/generic/addsub_n.c (revision 86d7f5d3)
1*86d7f5d3SJohn Marino /* mpn_add_n_sub_n -- Add and Subtract two limb vectors of equal, non-zero length.
2*86d7f5d3SJohn Marino 
3*86d7f5d3SJohn Marino    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
4*86d7f5d3SJohn Marino    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5*86d7f5d3SJohn Marino    GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6*86d7f5d3SJohn Marino 
7*86d7f5d3SJohn Marino Copyright 1999, 2000, 2001, 2006 Free Software Foundation, Inc.
8*86d7f5d3SJohn Marino 
9*86d7f5d3SJohn Marino This file is part of the GNU MP Library.
10*86d7f5d3SJohn Marino 
11*86d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
12*86d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
13*86d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
14*86d7f5d3SJohn Marino option) any later version.
15*86d7f5d3SJohn Marino 
16*86d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
17*86d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18*86d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
19*86d7f5d3SJohn Marino License for more details.
20*86d7f5d3SJohn Marino 
21*86d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
22*86d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
23*86d7f5d3SJohn Marino 
24*86d7f5d3SJohn Marino #include "gmp.h"
25*86d7f5d3SJohn Marino #include "gmp-impl.h"
26*86d7f5d3SJohn Marino 
27*86d7f5d3SJohn Marino #ifndef L1_CACHE_SIZE
28*86d7f5d3SJohn Marino #define L1_CACHE_SIZE 8192	/* only 68040 has less than this */
29*86d7f5d3SJohn Marino #endif
30*86d7f5d3SJohn Marino 
31*86d7f5d3SJohn Marino #define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
32*86d7f5d3SJohn Marino 
33*86d7f5d3SJohn Marino 
34*86d7f5d3SJohn Marino /* mpn_add_n_sub_n.
35*86d7f5d3SJohn Marino    r1[] = s1[] + s2[]
36*86d7f5d3SJohn Marino    r2[] = s1[] - s2[]
37*86d7f5d3SJohn Marino    All operands have n limbs.
38*86d7f5d3SJohn Marino    In-place operations allowed.  */
39*86d7f5d3SJohn Marino mp_limb_t
mpn_add_n_sub_n(mp_ptr r1p,mp_ptr r2p,mp_srcptr s1p,mp_srcptr s2p,mp_size_t n)40*86d7f5d3SJohn Marino mpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
41*86d7f5d3SJohn Marino {
42*86d7f5d3SJohn Marino   mp_limb_t acyn, acyo;		/* carry for add */
43*86d7f5d3SJohn Marino   mp_limb_t scyn, scyo;		/* carry for subtract */
44*86d7f5d3SJohn Marino   mp_size_t off;		/* offset in operands */
45*86d7f5d3SJohn Marino   mp_size_t this_n;		/* size of current chunk */
46*86d7f5d3SJohn Marino 
47*86d7f5d3SJohn Marino   /* We alternatingly add and subtract in chunks that fit into the (L1)
48*86d7f5d3SJohn Marino      cache.  Since the chunks are several hundred limbs, the function call
49*86d7f5d3SJohn Marino      overhead is insignificant, but we get much better locality.  */
50*86d7f5d3SJohn Marino 
51*86d7f5d3SJohn Marino   /* We have three variant of the inner loop, the proper loop is chosen
52*86d7f5d3SJohn Marino      depending on whether r1 or r2 are the same operand as s1 or s2.  */
53*86d7f5d3SJohn Marino 
54*86d7f5d3SJohn Marino   if (r1p != s1p && r1p != s2p)
55*86d7f5d3SJohn Marino     {
56*86d7f5d3SJohn Marino       /* r1 is not identical to either input operand.  We can therefore write
57*86d7f5d3SJohn Marino 	 to r1 directly, without using temporary storage.  */
58*86d7f5d3SJohn Marino       acyo = 0;
59*86d7f5d3SJohn Marino       scyo = 0;
60*86d7f5d3SJohn Marino       for (off = 0; off < n; off += PART_SIZE)
61*86d7f5d3SJohn Marino 	{
62*86d7f5d3SJohn Marino 	  this_n = MIN (n - off, PART_SIZE);
63*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_nc
64*86d7f5d3SJohn Marino 	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
65*86d7f5d3SJohn Marino #else
66*86d7f5d3SJohn Marino 	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
67*86d7f5d3SJohn Marino 	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
68*86d7f5d3SJohn Marino #endif
69*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_sub_nc
70*86d7f5d3SJohn Marino 	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
71*86d7f5d3SJohn Marino #else
72*86d7f5d3SJohn Marino 	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
73*86d7f5d3SJohn Marino 	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
74*86d7f5d3SJohn Marino #endif
75*86d7f5d3SJohn Marino 	}
76*86d7f5d3SJohn Marino     }
77*86d7f5d3SJohn Marino   else if (r2p != s1p && r2p != s2p)
78*86d7f5d3SJohn Marino     {
79*86d7f5d3SJohn Marino       /* r2 is not identical to either input operand.  We can therefore write
80*86d7f5d3SJohn Marino 	 to r2 directly, without using temporary storage.  */
81*86d7f5d3SJohn Marino       acyo = 0;
82*86d7f5d3SJohn Marino       scyo = 0;
83*86d7f5d3SJohn Marino       for (off = 0; off < n; off += PART_SIZE)
84*86d7f5d3SJohn Marino 	{
85*86d7f5d3SJohn Marino 	  this_n = MIN (n - off, PART_SIZE);
86*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_sub_nc
87*86d7f5d3SJohn Marino 	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
88*86d7f5d3SJohn Marino #else
89*86d7f5d3SJohn Marino 	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
90*86d7f5d3SJohn Marino 	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
91*86d7f5d3SJohn Marino #endif
92*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_nc
93*86d7f5d3SJohn Marino 	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
94*86d7f5d3SJohn Marino #else
95*86d7f5d3SJohn Marino 	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
96*86d7f5d3SJohn Marino 	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
97*86d7f5d3SJohn Marino #endif
98*86d7f5d3SJohn Marino 	}
99*86d7f5d3SJohn Marino     }
100*86d7f5d3SJohn Marino   else
101*86d7f5d3SJohn Marino     {
102*86d7f5d3SJohn Marino       /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2 or vice versa)
103*86d7f5d3SJohn Marino 	 Need temporary storage.  */
104*86d7f5d3SJohn Marino       mp_limb_t tp[PART_SIZE];
105*86d7f5d3SJohn Marino       acyo = 0;
106*86d7f5d3SJohn Marino       scyo = 0;
107*86d7f5d3SJohn Marino       for (off = 0; off < n; off += PART_SIZE)
108*86d7f5d3SJohn Marino 	{
109*86d7f5d3SJohn Marino 	  this_n = MIN (n - off, PART_SIZE);
110*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_nc
111*86d7f5d3SJohn Marino 	  acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
112*86d7f5d3SJohn Marino #else
113*86d7f5d3SJohn Marino 	  acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
114*86d7f5d3SJohn Marino 	  acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
115*86d7f5d3SJohn Marino #endif
116*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_sub_nc
117*86d7f5d3SJohn Marino 	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
118*86d7f5d3SJohn Marino #else
119*86d7f5d3SJohn Marino 	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
120*86d7f5d3SJohn Marino 	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
121*86d7f5d3SJohn Marino #endif
122*86d7f5d3SJohn Marino 	  MPN_COPY (r1p + off, tp, this_n);
123*86d7f5d3SJohn Marino 	}
124*86d7f5d3SJohn Marino     }
125*86d7f5d3SJohn Marino 
126*86d7f5d3SJohn Marino   return 2 * acyo + scyo;
127*86d7f5d3SJohn Marino }
128*86d7f5d3SJohn Marino 
129*86d7f5d3SJohn Marino #ifdef MAIN
130*86d7f5d3SJohn Marino #include <stdlib.h>
131*86d7f5d3SJohn Marino #include <stdio.h>
132*86d7f5d3SJohn Marino #include "timing.h"
133*86d7f5d3SJohn Marino 
134*86d7f5d3SJohn Marino long cputime ();
135*86d7f5d3SJohn Marino 
136*86d7f5d3SJohn Marino int
main(int argc,char ** argv)137*86d7f5d3SJohn Marino main (int argc, char **argv)
138*86d7f5d3SJohn Marino {
139*86d7f5d3SJohn Marino   mp_ptr r1p, r2p, s1p, s2p;
140*86d7f5d3SJohn Marino   double t;
141*86d7f5d3SJohn Marino   mp_size_t n;
142*86d7f5d3SJohn Marino 
143*86d7f5d3SJohn Marino   n = strtol (argv[1], 0, 0);
144*86d7f5d3SJohn Marino 
145*86d7f5d3SJohn Marino   r1p = malloc (n * BYTES_PER_MP_LIMB);
146*86d7f5d3SJohn Marino   r2p = malloc (n * BYTES_PER_MP_LIMB);
147*86d7f5d3SJohn Marino   s1p = malloc (n * BYTES_PER_MP_LIMB);
148*86d7f5d3SJohn Marino   s2p = malloc (n * BYTES_PER_MP_LIMB);
149*86d7f5d3SJohn Marino   TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
150*86d7f5d3SJohn Marino   printf ("              separate add and sub: %.3f\n", t);
151*86d7f5d3SJohn Marino   TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,s2p,n));
152*86d7f5d3SJohn Marino   printf ("combined addsub separate variables: %.3f\n", t);
153*86d7f5d3SJohn Marino   TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
154*86d7f5d3SJohn Marino   printf ("        combined addsub r1 overlap: %.3f\n", t);
155*86d7f5d3SJohn Marino   TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
156*86d7f5d3SJohn Marino   printf ("        combined addsub r2 overlap: %.3f\n", t);
157*86d7f5d3SJohn Marino   TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,r2p,n));
158*86d7f5d3SJohn Marino   printf ("          combined addsub in-place: %.3f\n", t);
159*86d7f5d3SJohn Marino 
160*86d7f5d3SJohn Marino   return 0;
161*86d7f5d3SJohn Marino }
162*86d7f5d3SJohn Marino #endif
163