1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 
22 /* rank-0, vector-rank-3, non-square in-place transposition
23    (see rank0.c for square transposition)  */
24 
25 #include "rdft/rdft.h"
26 
27 #ifdef HAVE_STRING_H
28 #include <string.h>		/* for memcpy() */
29 #endif
30 
31 struct P_s;
32 
33 typedef struct {
34      rdftapply apply;
35      int (*applicable)(const problem_rdft *p, planner *plnr,
36 		       int dim0, int dim1, int dim2, INT *nbuf);
37      int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego);
38      const char *nam;
39 } transpose_adt;
40 
41 typedef struct {
42      solver super;
43      const transpose_adt *adt;
44 } S;
45 
46 typedef struct P_s {
47      plan_rdft super;
48      INT n, m, vl; /* transpose n x m matrix of vl-tuples */
49      INT nbuf; /* buffer size */
50      INT nd, md, d; /* transpose-gcd params */
51      INT nc, mc; /* transpose-cut params */
52      plan *cld1, *cld2, *cld3; /* children, null if unused */
53      const S *slv;
54 } P;
55 
56 
57 /*************************************************************************/
58 /* some utilities for the solvers */
59 
gcd(INT a,INT b)60 static INT gcd(INT a, INT b)
61 {
62      INT r;
63      do {
64 	  r = a % b;
65 	  a = b;
66 	  b = r;
67      } while (r != 0);
68 
69      return a;
70 }
71 
72 /* whether we can transpose with one of our routines expecting
73    contiguous Ntuples */
Ntuple_transposable(const iodim * a,const iodim * b,INT vl,INT vs)74 static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs)
75 {
76      return (vs == 1 && b->is == vl && a->os == vl &&
77 	     ((a->n == b->n && a->is == b->os
78 	       && a->is >= b->n && a->is % vl == 0)
79 	      || (a->is == b->n * vl && b->os == a->n * vl)));
80 }
81 
82 /* check whether a and b correspond to the first and second dimensions
83    of a transpose of tuples with vector length = vl, stride = vs. */
transposable(const iodim * a,const iodim * b,INT vl,INT vs)84 static int transposable(const iodim *a, const iodim *b, INT vl, INT vs)
85 {
86      return ((a->n == b->n && a->os == b->is && a->is == b->os)
87              || Ntuple_transposable(a, b, vl, vs));
88 }
89 
pickdim(const tensor * s,int * pdim0,int * pdim1,int * pdim2)90 static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2)
91 {
92      int dim0, dim1;
93 
94      for (dim0 = 0; dim0 < s->rnk; ++dim0)
95           for (dim1 = 0; dim1 < s->rnk; ++dim1) {
96 	       int dim2 = 3 - dim0 - dim1;
97 	       if (dim0 == dim1) continue;
98                if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os)
99 		   && transposable(s->dims + dim0, s->dims + dim1,
100 				   s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
101 				   s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
102                     *pdim0 = dim0;
103                     *pdim1 = dim1;
104 		    *pdim2 = dim2;
105                     return 1;
106                }
107 	  }
108      return 0;
109 }
110 
111 #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
112 #define MAXBUF 65536 /* maximum non-ugly buffer */
113 
114 /* generic applicability function */
applicable(const solver * ego_,const problem * p_,planner * plnr,int * dim0,int * dim1,int * dim2,INT * nbuf)115 static int applicable(const solver *ego_, const problem *p_, planner *plnr,
116 		      int *dim0, int *dim1, int *dim2, INT *nbuf)
117 {
118      const S *ego = (const S *) ego_;
119      const problem_rdft *p = (const problem_rdft *) p_;
120 
121      return (1
122 	     && p->I == p->O
123 	     && p->sz->rnk == 0
124 	     && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3)
125 
126 	     && pickdim(p->vecsz, dim0, dim1, dim2)
127 
128 	     /* UGLY if vecloop in wrong order for locality */
129 	     && (!NO_UGLYP(plnr) ||
130 		 p->vecsz->rnk == 2 ||
131 		 X(iabs)(p->vecsz->dims[*dim2].is)
132 		 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
133 			   X(iabs)(p->vecsz->dims[*dim0].os)))
134 
135 	     /* SLOW if non-square */
136 	     && (!NO_SLOWP(plnr)
137 		 || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n)
138 
139 	     && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf)
140 
141 	     /* buffers too big are UGLY */
142 	     && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
143 		 || *nbuf <= MAXBUF
144 		 || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz))
145 	  );
146 }
147 
get_transpose_vec(const problem_rdft * p,int dim2,INT * vl,INT * vs)148 static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs)
149 {
150      if (p->vecsz->rnk == 2) {
151 	  *vl = 1; *vs = 1;
152      }
153      else {
154 	  *vl = p->vecsz->dims[dim2].n;
155 	  *vs = p->vecsz->dims[dim2].is; /* == os */
156      }
157 }
158 
159 /*************************************************************************/
160 /* Cache-oblivious in-place transpose of non-square matrices, based
161    on transposes of blocks given by the gcd of the dimensions.
162 
163    This algorithm is related to algorithm V5 from Murray Dow,
164    "Transposing a matrix on a vector computer," Parallel Computing 21
165    (12), 1997-2005 (1995), with the modification that we use
166    cache-oblivious recursive transpose subroutines (and we derived
167    it independently).
168 
169    For a p x q matrix, this requires scratch space equal to the size
170    of the matrix divided by gcd(p,q).  Alternatively, see also the
171    "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */
172 
apply_gcd(const plan * ego_,R * I,R * O)173 static void apply_gcd(const plan *ego_, R *I, R *O)
174 {
175      const P *ego = (const P *) ego_;
176      INT n = ego->nd, m = ego->md, d = ego->d;
177      INT vl = ego->vl;
178      R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
179      INT i, num_el = n*m*d*vl;
180 
181      A(ego->n == n * d && ego->m == m * d);
182      UNUSED(O);
183 
184      /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix
185 	of vl-tuples and buf contains n*m*d*vl elements.
186 
187 	In general, to transpose a p x q matrix, you should call this
188 	routine with d = gcd(p, q), n = p/d, and m = q/d.  */
189 
190      A(n > 0 && m > 0 && vl > 0);
191      A(d > 1);
192 
193      /* treat as (d x n) x (d' x m) matrix.  (d' = d) */
194 
195      /* First, transpose d x (n x d') x m to d x (d' x n) x m,
196 	using the buf matrix.  This consists of d transposes
197 	of contiguous n x d' matrices of m-tuples. */
198      if (n > 1) {
199 	  rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
200 	  for (i = 0; i < d; ++i) {
201 	       cldapply(ego->cld1, I + i*num_el, buf);
202 	       memcpy(I + i*num_el, buf, num_el*sizeof(R));
203 	  }
204      }
205 
206      /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
207 	is a square in-place transpose of n*m-tuples: */
208      {
209 	  rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
210 	  cldapply(ego->cld2, I, I);
211      }
212 
213      /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
214 	using the buf matrix.  This consists of d' transposes
215 	of contiguous d*n x m matrices. */
216      if (m > 1) {
217 	  rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
218 	  for (i = 0; i < d; ++i) {
219 	       cldapply(ego->cld3, I + i*num_el, buf);
220 	       memcpy(I + i*num_el, buf, num_el*sizeof(R));
221 	  }
222      }
223 
224      X(ifree)(buf);
225 }
226 
applicable_gcd(const problem_rdft * p,planner * plnr,int dim0,int dim1,int dim2,INT * nbuf)227 static int applicable_gcd(const problem_rdft *p, planner *plnr,
228 			  int dim0, int dim1, int dim2, INT *nbuf)
229 {
230      INT n = p->vecsz->dims[dim0].n;
231      INT m = p->vecsz->dims[dim1].n;
232      INT d, vl, vs;
233      get_transpose_vec(p, dim2, &vl, &vs);
234      d = gcd(n, m);
235      *nbuf = n * (m / d) * vl;
236      return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
237 	     && n != m
238 	     && d > 1
239 	     && Ntuple_transposable(p->vecsz->dims + dim0,
240 				    p->vecsz->dims + dim1,
241 				    vl, vs));
242 }
243 
mkcldrn_gcd(const problem_rdft * p,planner * plnr,P * ego)244 static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego)
245 {
246      INT n = ego->nd, m = ego->md, d = ego->d;
247      INT vl = ego->vl;
248      R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
249      INT num_el = n*m*d*vl;
250 
251      if (n > 1) {
252 	  ego->cld1 = X(mkplan_d)(plnr,
253 				  X(mkproblem_rdft_0_d)(
254 				       X(mktensor_3d)(n, d*m*vl, m*vl,
255 						      d, m*vl, n*m*vl,
256 						      m*vl, 1, 1),
257 				       TAINT(p->I, num_el), buf));
258 	  if (!ego->cld1)
259 	       goto nada;
260 	  X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
261 		      &ego->super.super.ops);
262 	  ego->super.super.ops.other += num_el * d * 2;
263      }
264 
265      ego->cld2 = X(mkplan_d)(plnr,
266 			     X(mkproblem_rdft_0_d)(
267 				  X(mktensor_3d)(d, d*n*m*vl, n*m*vl,
268 						 d, n*m*vl, d*n*m*vl,
269 						 n*m*vl, 1, 1),
270 				  p->I, p->I));
271      if (!ego->cld2)
272 	  goto nada;
273      X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
274 
275      if (m > 1) {
276 	  ego->cld3 = X(mkplan_d)(plnr,
277 				  X(mkproblem_rdft_0_d)(
278 				       X(mktensor_3d)(d*n, m*vl, vl,
279 						      m, vl, d*n*vl,
280 						      vl, 1, 1),
281 				       TAINT(p->I, num_el), buf));
282 	  if (!ego->cld3)
283 	       goto nada;
284 	  X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
285 	  ego->super.super.ops.other += num_el * d * 2;
286      }
287 
288      X(ifree)(buf);
289      return 1;
290 
291  nada:
292      X(ifree)(buf);
293      return 0;
294 }
295 
296 static const transpose_adt adt_gcd =
297 {
298      apply_gcd, applicable_gcd, mkcldrn_gcd,
299      "rdft-transpose-gcd"
300 };
301 
302 /*************************************************************************/
303 /* Cache-oblivious in-place transpose of non-square n x m matrices,
304    based on transposing a sub-matrix first and then transposing the
305    remainder(s) with the help of a buffer.  See also transpose-gcd,
306    above, if gcd(n,m) is large.
307 
308    This algorithm is related to algorithm V3 from Murray Dow,
309    "Transposing a matrix on a vector computer," Parallel Computing 21
310    (12), 1997-2005 (1995), with the modifications that we use
311    cache-oblivious recursive transpose subroutines and we have the
312    generalization for large |n-m| below.
313 
314    The best case, and the one described by Dow, is for |n-m| small, in
315    which case we transpose a square sub-matrix of size min(n,m),
316    handling the remainder via a buffer.  This requires scratch space
317    equal to the size of the matrix times |n-m| / max(n,m).
318 
319    As a generalization when |n-m| is not small, we also support cutting
320    *both* dimensions to an nc x mc matrix which is *not* necessarily
321    square, but has a large gcd (and can therefore use transpose-gcd).
322 */
323 
apply_cut(const plan * ego_,R * I,R * O)324 static void apply_cut(const plan *ego_, R *I, R *O)
325 {
326      const P *ego = (const P *) ego_;
327      INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
328      INT i;
329      R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
330      UNUSED(O);
331 
332      if (m > mc) {
333 	  ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1);
334 	  for (i = 0; i < nc; ++i)
335 	       memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl));
336      }
337 
338      ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */
339 
340      if (n > nc) {
341 	  R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */
342 	  memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R));
343 	  for (i = mc-1; i >= 0; --i)
344 	       memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl));
345 	  ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl);
346      }
347 
348      if (m > mc) {
349 	  if (n > nc)
350 	       for (i = mc; i < m; ++i)
351 		    memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl),
352 			   (nc*vl)*sizeof(R));
353 	  else
354 	       memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R));
355      }
356 
357      X(ifree)(buf1);
358 }
359 
360 /* only cut one dimension if the resulting buffer is small enough */
cut1(INT n,INT m,INT vl)361 static int cut1(INT n, INT m, INT vl)
362 {
363      return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
364 	     || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
365 }
366 
367 #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
368 
applicable_cut(const problem_rdft * p,planner * plnr,int dim0,int dim1,int dim2,INT * nbuf)369 static int applicable_cut(const problem_rdft *p, planner *plnr,
370 			  int dim0, int dim1, int dim2, INT *nbuf)
371 {
372      INT n = p->vecsz->dims[dim0].n;
373      INT m = p->vecsz->dims[dim1].n;
374      INT vl, vs;
375      get_transpose_vec(p, dim2, &vl, &vs);
376      *nbuf = 0; /* always small enough to be non-UGLY (?) */
377      A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
378      return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
379 	     && n != m
380 
381 	     /* Don't call transpose-cut recursively (avoid inf. loops):
382 	        the non-square sub-transpose produced when !cut1
383 	        should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
384 	        for which transpose-gcd is applicable */
385 	     && (cut1(n, m, vl)
386 		 || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
387 
388 	     && Ntuple_transposable(p->vecsz->dims + dim0,
389 				    p->vecsz->dims + dim1,
390 				    vl, vs));
391 }
392 
mkcldrn_cut(const problem_rdft * p,planner * plnr,P * ego)393 static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego)
394 {
395      INT n = ego->n, m = ego->m, nc, mc;
396      INT vl = ego->vl;
397      R *buf;
398 
399      /* pick the "best" cut */
400      if (cut1(n, m, vl)) {
401 	  nc = mc = X(imin)(n,m);
402      }
403      else {
404 	  INT dc, ns, ms;
405 	  dc = gcd(m, n); nc = n; mc = m;
406 	  /* search for cut with largest gcd
407 	     (TODO: different optimality criteria? different search range?) */
408 	  for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
409 	       for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
410 		    INT ds = gcd(ms, ns);
411 		    if (ds > dc) {
412 			 dc = ds; nc = ns; mc = ms;
413 			 if (dc == X(imin)(ns, ms))
414 			      break; /* cannot get larger than this */
415 		    }
416 	       }
417 	       if (dc == X(imin)(n, ms))
418 		    break; /* cannot get larger than this */
419 	  }
420 	  A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
421      }
422      ego->nc = nc;
423      ego->mc = mc;
424      ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl);
425 
426      buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
427 
428      if (m > mc) {
429 	  ego->cld1 = X(mkplan_d)(plnr,
430 				  X(mkproblem_rdft_0_d)(
431 				       X(mktensor_3d)(nc, m*vl, vl,
432 						      m-mc, vl, nc*vl,
433 						      vl, 1, 1),
434 				       p->I + mc*vl, buf));
435 	  if (!ego->cld1)
436 	       goto nada;
437 	  X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
438      }
439 
440      ego->cld2 = X(mkplan_d)(plnr,
441 			     X(mkproblem_rdft_0_d)(
442 				  X(mktensor_3d)(nc, mc*vl, vl,
443 						 mc, vl, nc*vl,
444 						 vl, 1, 1),
445 				  p->I, p->I));
446      if (!ego->cld2)
447 	  goto nada;
448      X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
449 
450      if (n > nc) {
451 	  ego->cld3 = X(mkplan_d)(plnr,
452 				  X(mkproblem_rdft_0_d)(
453 				       X(mktensor_3d)(n-nc, m*vl, vl,
454 						      m, vl, n*vl,
455 						      vl, 1, 1),
456 				       buf + (m-mc)*(nc*vl), p->I + nc*vl));
457 	  if (!ego->cld3)
458 	       goto nada;
459 	  X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
460      }
461 
462      /* memcpy/memmove operations */
463      ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc))
464 					     + (n-nc)*m + (m-mc)*nc);
465 
466      X(ifree)(buf);
467      return 1;
468 
469  nada:
470      X(ifree)(buf);
471      return 0;
472 }
473 
474 static const transpose_adt adt_cut =
475 {
476      apply_cut, applicable_cut, mkcldrn_cut,
477      "rdft-transpose-cut"
478 };
479 
480 /*************************************************************************/
481 /* In-place transpose routine from TOMS, which follows the cycles of
482    the permutation so that it writes to each location only once.
483    Because of cache-line and other issues, however, this routine is
484    typically much slower than transpose-gcd or transpose-cut, even
485    though the latter do some extra writes.  On the other hand, if the
486    vector length is large then the TOMS routine is best.
487 
488    The TOMS routine also has the advantage of requiring less buffer
489    space for the case of gcd(nx,ny) small.  However, in this case it
490    has been superseded by the combination of the generalized
491    transpose-cut method with the transpose-gcd method, which can
492    always transpose with buffers a small fraction of the array size
493    regardless of gcd(nx,ny). */
494 
495 /*
496  * TOMS Transpose.  Algorithm 513 (Revised version of algorithm 380).
497  *
498  * These routines do in-place transposes of arrays.
499  *
500  * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software,
501  *   vol. 3, no. 1, 104-110 (1977) ]
502  *
503  * C version by Steven G. Johnson (February 1997).
504  */
505 
506 /*
507  * "a" is a 1D array of length ny*nx*N which constains the nx x ny
508  * matrix of N-tuples to be transposed.  "a" is stored in row-major
509  * order (last index varies fastest).  move is a 1D array of length
510  * move_size used to store information to speed up the process.  The
511  * value move_size=(ny+nx)/2 is recommended.  buf should be an array
512  * of length 2*N.
513  *
514  */
515 
transpose_toms513(R * a,INT nx,INT ny,INT N,char * move,INT move_size,R * buf)516 static void transpose_toms513(R *a, INT nx, INT ny, INT N,
517                               char *move, INT move_size, R *buf)
518 {
519      INT i, im, mn;
520      R *b, *c, *d;
521      INT ncount;
522      INT k;
523 
524      /* check arguments and initialize: */
525      A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
526 
527      b = buf;
528 
529      /* Cate & Twigg have a special case for nx == ny, but we don't
530 	bother, since we already have special code for this case elsewhere. */
531 
532      c = buf + N;
533      ncount = 2;		/* always at least 2 fixed points */
534      k = (mn = ny * nx) - 1;
535 
536      for (i = 0; i < move_size; ++i)
537 	  move[i] = 0;
538 
539      if (ny >= 3 && nx >= 3)
540 	  ncount += gcd(ny - 1, nx - 1) - 1;	/* # fixed points */
541 
542      i = 1;
543      im = ny;
544 
545      while (1) {
546 	  INT i1, i2, i1c, i2c;
547 	  INT kmi;
548 
549 	  /** Rearrange the elements of a loop
550 	      and its companion loop: **/
551 
552 	  i1 = i;
553 	  kmi = k - i;
554 	  i1c = kmi;
555 	  switch (N) {
556 	      case 1:
557 		   b[0] = a[i1];
558 		   c[0] = a[i1c];
559 		   break;
560 	      case 2:
561 		   b[0] = a[2*i1];
562 		   b[1] = a[2*i1+1];
563 		   c[0] = a[2*i1c];
564 		   c[1] = a[2*i1c+1];
565 		   break;
566 	      default:
567 		   memcpy(b, &a[N * i1], N * sizeof(R));
568 		   memcpy(c, &a[N * i1c], N * sizeof(R));
569 	  }
570 	  while (1) {
571 	       i2 = ny * i1 - k * (i1 / nx);
572 	       i2c = k - i2;
573 	       if (i1 < move_size)
574 		    move[i1] = 1;
575 	       if (i1c < move_size)
576 		    move[i1c] = 1;
577 	       ncount += 2;
578 	       if (i2 == i)
579 		    break;
580 	       if (i2 == kmi) {
581 		    d = b;
582 		    b = c;
583 		    c = d;
584 		    break;
585 	       }
586 	       switch (N) {
587 		   case 1:
588 			a[i1] = a[i2];
589 			a[i1c] = a[i2c];
590 			break;
591 		   case 2:
592 			a[2*i1] = a[2*i2];
593 			a[2*i1+1] = a[2*i2+1];
594 			a[2*i1c] = a[2*i2c];
595 			a[2*i1c+1] = a[2*i2c+1];
596 			break;
597 		   default:
598 			memcpy(&a[N * i1], &a[N * i2],
599 			       N * sizeof(R));
600 			memcpy(&a[N * i1c], &a[N * i2c],
601 			       N * sizeof(R));
602 	       }
603 	       i1 = i2;
604 	       i1c = i2c;
605 	  }
606 	  switch (N) {
607 	      case 1:
608 		   a[i1] = b[0];
609 		   a[i1c] = c[0];
610 		   break;
611 	      case 2:
612 		   a[2*i1] = b[0];
613 		   a[2*i1+1] = b[1];
614 		   a[2*i1c] = c[0];
615 		   a[2*i1c+1] = c[1];
616 		   break;
617 	      default:
618 		   memcpy(&a[N * i1], b, N * sizeof(R));
619 		   memcpy(&a[N * i1c], c, N * sizeof(R));
620 	  }
621 	  if (ncount >= mn)
622 	       break;	/* we've moved all elements */
623 
624 	  /** Search for loops to rearrange: **/
625 
626 	  while (1) {
627 	       INT max = k - i;
628 	       ++i;
629 	       A(i <= max);
630 	       im += ny;
631 	       if (im > k)
632 		    im -= k;
633 	       i2 = im;
634 	       if (i == i2)
635 		    continue;
636 	       if (i >= move_size) {
637 		    while (i2 > i && i2 < max) {
638 			 i1 = i2;
639 			 i2 = ny * i1 - k * (i1 / nx);
640 		    }
641 		    if (i2 == i)
642 			 break;
643 	       } else if (!move[i])
644 		    break;
645 	  }
646      }
647 }
648 
apply_toms513(const plan * ego_,R * I,R * O)649 static void apply_toms513(const plan *ego_, R *I, R *O)
650 {
651      const P *ego = (const P *) ego_;
652      INT n = ego->n, m = ego->m;
653      INT vl = ego->vl;
654      R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
655      UNUSED(O);
656      transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
657      X(ifree)(buf);
658 }
659 
applicable_toms513(const problem_rdft * p,planner * plnr,int dim0,int dim1,int dim2,INT * nbuf)660 static int applicable_toms513(const problem_rdft *p, planner *plnr,
661 			   int dim0, int dim1, int dim2, INT *nbuf)
662 {
663      INT n = p->vecsz->dims[dim0].n;
664      INT m = p->vecsz->dims[dim1].n;
665      INT vl, vs;
666      get_transpose_vec(p, dim2, &vl, &vs);
667      *nbuf = 2*vl
668 	  + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
669      return (!NO_SLOWP(plnr)
670 	     && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */
671 	     && n != m
672 	     && Ntuple_transposable(p->vecsz->dims + dim0,
673 				    p->vecsz->dims + dim1,
674 				    vl, vs));
675 }
676 
mkcldrn_toms513(const problem_rdft * p,planner * plnr,P * ego)677 static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego)
678 {
679      UNUSED(p); UNUSED(plnr);
680      /* heuristic so that TOMS algorithm is last resort for small vl */
681      ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
682      return 1;
683 }
684 
685 static const transpose_adt adt_toms513 =
686 {
687      apply_toms513, applicable_toms513, mkcldrn_toms513,
688      "rdft-transpose-toms513"
689 };
690 
691 /*-----------------------------------------------------------------------*/
692 /*-----------------------------------------------------------------------*/
693 /* generic stuff: */
694 
awake(plan * ego_,enum wakefulness wakefulness)695 static void awake(plan *ego_, enum wakefulness wakefulness)
696 {
697      P *ego = (P *) ego_;
698      X(plan_awake)(ego->cld1, wakefulness);
699      X(plan_awake)(ego->cld2, wakefulness);
700      X(plan_awake)(ego->cld3, wakefulness);
701 }
702 
print(const plan * ego_,printer * p)703 static void print(const plan *ego_, printer *p)
704 {
705      const P *ego = (const P *) ego_;
706      p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
707 	      ego->n, ego->m, ego->vl);
708      if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
709      if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
710      if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
711      p->print(p, ")");
712 }
713 
destroy(plan * ego_)714 static void destroy(plan *ego_)
715 {
716      P *ego = (P *) ego_;
717      X(plan_destroy_internal)(ego->cld3);
718      X(plan_destroy_internal)(ego->cld2);
719      X(plan_destroy_internal)(ego->cld1);
720 }
721 
mkplan(const solver * ego_,const problem * p_,planner * plnr)722 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
723 {
724      const S *ego = (const S *) ego_;
725      const problem_rdft *p;
726      int dim0, dim1, dim2;
727      INT nbuf, vs;
728      P *pln;
729 
730      static const plan_adt padt = {
731 	  X(rdft_solve), awake, print, destroy
732      };
733 
734      if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
735           return (plan *) 0;
736 
737      p = (const problem_rdft *) p_;
738      pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
739 
740      pln->n = p->vecsz->dims[dim0].n;
741      pln->m = p->vecsz->dims[dim1].n;
742      get_transpose_vec(p, dim2, &pln->vl, &vs);
743      pln->nbuf = nbuf;
744      pln->d = gcd(pln->n, pln->m);
745      pln->nd = pln->n / pln->d;
746      pln->md = pln->m / pln->d;
747      pln->slv = ego;
748 
749      X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
750 
751      pln->cld1 = pln->cld2 = pln->cld3 = 0;
752      if (!ego->adt->mkcldrn(p, plnr, pln)) {
753 	  X(plan_destroy_internal)(&(pln->super.super));
754 	  return 0;
755      }
756 
757      return &(pln->super.super);
758 }
759 
mksolver(const transpose_adt * adt)760 static solver *mksolver(const transpose_adt *adt)
761 {
762      static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
763      S *slv = MKSOLVER(S, &sadt);
764      slv->adt = adt;
765      return &(slv->super);
766 }
767 
X(rdft_vrank3_transpose_register)768 void X(rdft_vrank3_transpose_register)(planner *p)
769 {
770      unsigned i;
771      static const transpose_adt *const adts[] = {
772 	  &adt_gcd, &adt_cut,
773 	  &adt_toms513
774      };
775      for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
776           REGISTER_SOLVER(p, mksolver(adts[i]));
777 }
778