1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 
22 /* buffering of rdft2.  We always buffer the complex array */
23 
24 #include "rdft/rdft.h"
25 #include "dft/dft.h"
26 
27 typedef struct {
28      solver super;
29      size_t maxnbuf_ndx;
30 } S;
31 
32 static const INT maxnbufs[] = { 8, 256 };
33 
34 typedef struct {
35      plan_rdft2 super;
36 
37      plan *cld, *cldcpy, *cldrest;
38      INT n, vl, nbuf, bufdist;
39      INT ivs_by_nbuf, ovs_by_nbuf;
40      INT ioffset, roffset;
41 } P;
42 
43 /* transform a vector input with the help of bufs */
apply_r2hc(const plan * ego_,R * r0,R * r1,R * cr,R * ci)44 static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
45 {
46      const P *ego = (const P *) ego_;
47      plan_rdft2 *cld = (plan_rdft2 *) ego->cld;
48      plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
49      INT i, vl = ego->vl, nbuf = ego->nbuf;
50      INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
51      R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
52      R *bufr = bufs + ego->roffset;
53      R *bufi = bufs + ego->ioffset;
54      plan_rdft2 *cldrest;
55 
56      for (i = nbuf; i <= vl; i += nbuf) {
57           /* transform to bufs: */
58           cld->apply((plan *) cld, r0, r1, bufr, bufi);
59 	  r0 += ivs_by_nbuf; r1 += ivs_by_nbuf;
60 
61           /* copy back */
62           cldcpy->apply((plan *) cldcpy, bufr, bufi, cr, ci);
63 	  cr += ovs_by_nbuf; ci += ovs_by_nbuf;
64      }
65 
66      X(ifree)(bufs);
67 
68      /* Do the remaining transforms, if any: */
69      cldrest = (plan_rdft2 *) ego->cldrest;
70      cldrest->apply((plan *) cldrest, r0, r1, cr, ci);
71 }
72 
73 /* for hc2r problems, copy the input into buffer, and then
74    transform buffer->output, which allows for destruction of the
75    buffer */
apply_hc2r(const plan * ego_,R * r0,R * r1,R * cr,R * ci)76 static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
77 {
78      const P *ego = (const P *) ego_;
79      plan_rdft2 *cld = (plan_rdft2 *) ego->cld;
80      plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
81      INT i, vl = ego->vl, nbuf = ego->nbuf;
82      INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
83      R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
84      R *bufr = bufs + ego->roffset;
85      R *bufi = bufs + ego->ioffset;
86      plan_rdft2 *cldrest;
87 
88      for (i = nbuf; i <= vl; i += nbuf) {
89           /* copy input into bufs: */
90           cldcpy->apply((plan *) cldcpy, cr, ci, bufr, bufi);
91 	  cr += ivs_by_nbuf; ci += ivs_by_nbuf;
92 
93           /* transform to output */
94           cld->apply((plan *) cld, r0, r1, bufr, bufi);
95 	  r0 += ovs_by_nbuf; r1 += ovs_by_nbuf;
96      }
97 
98      X(ifree)(bufs);
99 
100      /* Do the remaining transforms, if any: */
101      cldrest = (plan_rdft2 *) ego->cldrest;
102      cldrest->apply((plan *) cldrest, r0, r1, cr, ci);
103 }
104 
105 
awake(plan * ego_,enum wakefulness wakefulness)106 static void awake(plan *ego_, enum wakefulness wakefulness)
107 {
108      P *ego = (P *) ego_;
109 
110      X(plan_awake)(ego->cld, wakefulness);
111      X(plan_awake)(ego->cldcpy, wakefulness);
112      X(plan_awake)(ego->cldrest, wakefulness);
113 }
114 
destroy(plan * ego_)115 static void destroy(plan *ego_)
116 {
117      P *ego = (P *) ego_;
118      X(plan_destroy_internal)(ego->cldrest);
119      X(plan_destroy_internal)(ego->cldcpy);
120      X(plan_destroy_internal)(ego->cld);
121 }
122 
print(const plan * ego_,printer * p)123 static void print(const plan *ego_, printer *p)
124 {
125      const P *ego = (const P *) ego_;
126      p->print(p, "(rdft2-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
127               ego->n, ego->nbuf,
128               ego->vl, ego->bufdist % ego->n,
129               ego->cld, ego->cldcpy, ego->cldrest);
130 }
131 
applicable0(const S * ego,const problem * p_,const planner * plnr)132 static int applicable0(const S *ego, const problem *p_, const planner *plnr)
133 {
134      const problem_rdft2 *p = (const problem_rdft2 *) p_;
135      iodim *d = p->sz->dims;
136 
137      if (1
138 	 && p->vecsz->rnk <= 1
139 	 && p->sz->rnk == 1
140 
141 	 /* we assume even n throughout */
142 	 && (d[0].n % 2) == 0
143 
144 	 /* and we only consider these two cases */
145 	 && (p->kind == R2HC || p->kind == HC2R)
146 
147 	  ) {
148 	  INT vl, ivs, ovs;
149 	  X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
150 
151 	  if (X(toobig)(d[0].n) && CONSERVE_MEMORYP(plnr))
152 	       return 0;
153 
154 	  /* if this solver is redundant, in the sense that a solver
155 	     of lower index generates the same plan, then prune this
156 	     solver */
157 	  if (X(nbuf_redundant)(d[0].n, vl,
158 				ego->maxnbuf_ndx,
159 				maxnbufs, NELEM(maxnbufs)))
160 	       return 0;
161 
162 	  if (p->r0 != p->cr) {
163 	       if (p->kind == HC2R) {
164 		    /* Allow HC2R problems only if the input is to be
165 		       preserved.  This solver sets NO_DESTROY_INPUT,
166 		       which prevents infinite loops */
167 		    return (NO_DESTROY_INPUTP(plnr));
168 	       } else {
169 		    /*
170 		      In principle, the buffered transforms might be useful
171 		      when working out of place.  However, in order to
172 		      prevent infinite loops in the planner, we require
173 		      that the output stride of the buffered transforms be
174 		      greater than 2.
175 		    */
176 		    return (d[0].os > 2);
177 	       }
178 	  }
179 
180 	  /*
181 	   * If the problem is in place, the input/output strides must
182 	   * be the same or the whole thing must fit in the buffer.
183 	   */
184 	  if (X(rdft2_inplace_strides(p, RNK_MINFTY)))
185 	       return 1;
186 
187 	  if (/* fits into buffer: */
188 	       ((p->vecsz->rnk == 0)
189 		||
190 		(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
191 			 maxnbufs[ego->maxnbuf_ndx])
192 		 == p->vecsz->dims[0].n)))
193 	       return 1;
194      }
195 
196      return 0;
197 }
198 
applicable(const S * ego,const problem * p_,const planner * plnr)199 static int applicable(const S *ego, const problem *p_, const planner *plnr)
200 {
201      const problem_rdft2 *p;
202 
203      if (NO_BUFFERINGP(plnr)) return 0;
204 
205      if (!applicable0(ego, p_, plnr)) return 0;
206 
207      p = (const problem_rdft2 *) p_;
208      if (p->kind == HC2R) {
209 	  if (NO_UGLYP(plnr)) {
210 	       /* UGLY if in-place and too big, since the problem
211 		  could be solved via transpositions */
212 	       if (p->r0 == p->cr && X(toobig)(p->sz->dims[0].n))
213 		    return 0;
214 	  }
215      } else {
216 	  if (NO_UGLYP(plnr)) {
217 	       if (p->r0 != p->cr || X(toobig)(p->sz->dims[0].n))
218 		    return 0;
219 	  }
220      }
221      return 1;
222 }
223 
mkplan(const solver * ego_,const problem * p_,planner * plnr)224 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
225 {
226      P *pln;
227      const S *ego = (const S *)ego_;
228      plan *cld = (plan *) 0;
229      plan *cldcpy = (plan *) 0;
230      plan *cldrest = (plan *) 0;
231      const problem_rdft2 *p = (const problem_rdft2 *) p_;
232      R *bufs = (R *) 0;
233      INT nbuf = 0, bufdist, n, vl;
234      INT ivs, ovs, ioffset, roffset, id, od;
235 
236      static const plan_adt padt = {
237 	  X(rdft2_solve), awake, print, destroy
238      };
239 
240      if (!applicable(ego, p_, plnr))
241           goto nada;
242 
243      n = X(tensor_sz)(p->sz);
244      X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
245 
246      nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
247      bufdist = X(bufdist)(n + 2, vl); /* complex-side rdft2 stores N+2
248 					 real numbers */
249      A(nbuf > 0);
250 
251      /* attempt to keep real and imaginary part in the same order,
252 	so as to allow optimizations in the the copy plan */
253      roffset = (p->cr - p->ci > 0) ? (INT)1 : (INT)0;
254      ioffset = 1 - roffset;
255 
256      /* initial allocation for the purpose of planning */
257      bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
258 
259      id = ivs * (nbuf * (vl / nbuf));
260      od = ovs * (nbuf * (vl / nbuf));
261 
262      if (p->kind == R2HC) {
263 	  /* allow destruction of input if problem is in place */
264 	  cld = X(mkplan_f_d)(
265 	       plnr,
266 	       X(mkproblem_rdft2_d)(
267 		    X(mktensor_1d)(n, p->sz->dims[0].is, 2),
268 		    X(mktensor_1d)(nbuf, ivs, bufdist),
269 		    TAINT(p->r0, ivs * nbuf), TAINT(p->r1, ivs * nbuf),
270 		    bufs + roffset, bufs + ioffset, p->kind),
271 	       0, 0, (p->r0 == p->cr) ? NO_DESTROY_INPUT : 0);
272 	  if (!cld) goto nada;
273 
274 	  /* copying back from the buffer is a rank-0 DFT: */
275 	  cldcpy = X(mkplan_d)(
276 	       plnr,
277 	       X(mkproblem_dft_d)(
278 		    X(mktensor_0d)(),
279 		    X(mktensor_2d)(nbuf, bufdist, ovs,
280 				   n/2+1, 2, p->sz->dims[0].os),
281 		    bufs + roffset, bufs + ioffset,
282 		    TAINT(p->cr, ovs * nbuf), TAINT(p->ci, ovs * nbuf) ));
283 	  if (!cldcpy) goto nada;
284 
285 	  X(ifree)(bufs); bufs = 0;
286 
287 	  cldrest = X(mkplan_d)(plnr,
288 				X(mkproblem_rdft2_d)(
289 				     X(tensor_copy)(p->sz),
290 				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
291 				     p->r0 + id, p->r1 + id,
292 				     p->cr + od, p->ci + od,
293 				     p->kind));
294 	  if (!cldrest) goto nada;
295 	  pln = MKPLAN_RDFT2(P, &padt, apply_r2hc);
296      } else {
297 	  /* allow destruction of buffer */
298 	  cld = X(mkplan_f_d)(
299 	       plnr,
300 	       X(mkproblem_rdft2_d)(
301 		    X(mktensor_1d)(n, 2, p->sz->dims[0].os),
302 		    X(mktensor_1d)(nbuf, bufdist, ovs),
303 		    TAINT(p->r0, ovs * nbuf), TAINT(p->r1, ovs * nbuf),
304 		    bufs + roffset, bufs + ioffset, p->kind),
305 	       0, 0, NO_DESTROY_INPUT);
306 	  if (!cld) goto nada;
307 
308 	  /* copying input into buffer is a rank-0 DFT: */
309 	  cldcpy = X(mkplan_d)(
310 	       plnr,
311 	       X(mkproblem_dft_d)(
312 		    X(mktensor_0d)(),
313 		    X(mktensor_2d)(nbuf, ivs, bufdist,
314 				   n/2+1, p->sz->dims[0].is, 2),
315 		    TAINT(p->cr, ivs * nbuf), TAINT(p->ci, ivs * nbuf),
316 		    bufs + roffset, bufs + ioffset));
317 	  if (!cldcpy) goto nada;
318 
319 	  X(ifree)(bufs); bufs = 0;
320 
321 	  cldrest = X(mkplan_d)(plnr,
322 				X(mkproblem_rdft2_d)(
323 				     X(tensor_copy)(p->sz),
324 				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
325 				     p->r0 + od, p->r1 + od,
326 				     p->cr + id, p->ci + id,
327 				     p->kind));
328 	  if (!cldrest) goto nada;
329 
330 	  pln = MKPLAN_RDFT2(P, &padt, apply_hc2r);
331      }
332 
333      pln->cld = cld;
334      pln->cldcpy = cldcpy;
335      pln->cldrest = cldrest;
336      pln->n = n;
337      pln->vl = vl;
338      pln->ivs_by_nbuf = ivs * nbuf;
339      pln->ovs_by_nbuf = ovs * nbuf;
340      pln->roffset = roffset;
341      pln->ioffset = ioffset;
342 
343      pln->nbuf = nbuf;
344      pln->bufdist = bufdist;
345 
346      {
347 	  opcnt t;
348 	  X(ops_add)(&cld->ops, &cldcpy->ops, &t);
349 	  X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
350      }
351 
352      return &(pln->super.super);
353 
354  nada:
355      X(ifree0)(bufs);
356      X(plan_destroy_internal)(cldrest);
357      X(plan_destroy_internal)(cldcpy);
358      X(plan_destroy_internal)(cld);
359      return (plan *) 0;
360 }
361 
mksolver(size_t maxnbuf_ndx)362 static solver *mksolver(size_t maxnbuf_ndx)
363 {
364      static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
365      S *slv = MKSOLVER(S, &sadt);
366      slv->maxnbuf_ndx = maxnbuf_ndx;
367      return &(slv->super);
368 }
369 
X(rdft2_buffered_register)370 void X(rdft2_buffered_register)(planner *p)
371 {
372      size_t i;
373      for (i = 0; i < NELEM(maxnbufs); ++i)
374 	  REGISTER_SOLVER(p, mksolver(i));
375 }
376