1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 
22 #include "dft/dft.h"
23 
24 typedef struct {
25      solver super;
26      size_t maxnbuf_ndx;
27 } S;
28 
29 static const INT maxnbufs[] = { 8, 256 };
30 
31 typedef struct {
32      plan_dft super;
33 
34      plan *cld, *cldcpy, *cldrest;
35      INT n, vl, nbuf, bufdist;
36      INT ivs_by_nbuf, ovs_by_nbuf;
37      INT roffset, ioffset;
38 } P;
39 
40 /* transform a vector input with the help of bufs */
apply(const plan * ego_,R * ri,R * ii,R * ro,R * io)41 static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
42 {
43      const P *ego = (const P *) ego_;
44      INT nbuf = ego->nbuf;
45      R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist * 2, BUFFERS);
46 
47      plan_dft *cld = (plan_dft *) ego->cld;
48      plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
49      plan_dft *cldrest;
50      INT i, vl = ego->vl;
51      INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
52      INT roffset = ego->roffset, ioffset = ego->ioffset;
53 
54      for (i = nbuf; i <= vl; i += nbuf) {
55           /* transform to bufs: */
56           cld->apply((plan *) cld, ri, ii, bufs + roffset, bufs + ioffset);
57 	  ri += ivs_by_nbuf; ii += ivs_by_nbuf;
58 
59           /* copy back */
60           cldcpy->apply((plan *) cldcpy, bufs+roffset, bufs+ioffset, ro, io);
61 	  ro += ovs_by_nbuf; io += ovs_by_nbuf;
62      }
63 
64      X(ifree)(bufs);
65 
66      /* Do the remaining transforms, if any: */
67      cldrest = (plan_dft *) ego->cldrest;
68      cldrest->apply((plan *) cldrest, ri, ii, ro, io);
69 }
70 
71 
awake(plan * ego_,enum wakefulness wakefulness)72 static void awake(plan *ego_, enum wakefulness wakefulness)
73 {
74      P *ego = (P *) ego_;
75 
76      X(plan_awake)(ego->cld, wakefulness);
77      X(plan_awake)(ego->cldcpy, wakefulness);
78      X(plan_awake)(ego->cldrest, wakefulness);
79 }
80 
destroy(plan * ego_)81 static void destroy(plan *ego_)
82 {
83      P *ego = (P *) ego_;
84      X(plan_destroy_internal)(ego->cldrest);
85      X(plan_destroy_internal)(ego->cldcpy);
86      X(plan_destroy_internal)(ego->cld);
87 }
88 
print(const plan * ego_,printer * p)89 static void print(const plan *ego_, printer *p)
90 {
91      const P *ego = (const P *) ego_;
92      p->print(p, "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
93               ego->n, ego->nbuf,
94               ego->vl, ego->bufdist % ego->n,
95               ego->cld, ego->cldcpy, ego->cldrest);
96 }
97 
applicable0(const S * ego,const problem * p_,const planner * plnr)98 static int applicable0(const S *ego, const problem *p_, const planner *plnr)
99 {
100      const problem_dft *p = (const problem_dft *) p_;
101      const iodim *d = p->sz->dims;
102 
103      if (1
104 	 && p->vecsz->rnk <= 1
105 	 && p->sz->rnk == 1
106 	  ) {
107 	  INT vl, ivs, ovs;
108 	  X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
109 
110 	  if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
111 	       return 0;
112 
113 	  /* if this solver is redundant, in the sense that a solver
114 	     of lower index generates the same plan, then prune this
115 	     solver */
116 	  if (X(nbuf_redundant)(d[0].n, vl,
117 				ego->maxnbuf_ndx,
118 				maxnbufs, NELEM(maxnbufs)))
119 	       return 0;
120 
121 	  /*
122 	    In principle, the buffered transforms might be useful
123 	    when working out of place.  However, in order to
124 	    prevent infinite loops in the planner, we require
125 	    that the output stride of the buffered transforms be
126 	    greater than 2.
127 	  */
128 	  if (p->ri != p->ro)
129 	       return (d[0].os > 2);
130 
131 	  /*
132 	   * If the problem is in place, the input/output strides must
133 	   * be the same or the whole thing must fit in the buffer.
134 	   */
135 	  if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
136 	       return 1;
137 
138 	  if (/* fits into buffer: */
139 	       ((p->vecsz->rnk == 0)
140 		||
141 		(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
142 			 maxnbufs[ego->maxnbuf_ndx])
143 		 == p->vecsz->dims[0].n)))
144 	       return 1;
145      }
146 
147      return 0;
148 }
149 
applicable(const S * ego,const problem * p_,const planner * plnr)150 static int applicable(const S *ego, const problem *p_, const planner *plnr)
151 {
152      if (NO_BUFFERINGP(plnr)) return 0;
153      if (!applicable0(ego, p_, plnr)) return 0;
154 
155      if (NO_UGLYP(plnr)) {
156 	  const problem_dft *p = (const problem_dft *) p_;
157 	  if (p->ri != p->ro) return 0;
158 	  if (X(toobig)(p->sz->dims[0].n)) return 0;
159      }
160      return 1;
161 }
162 
mkplan(const solver * ego_,const problem * p_,planner * plnr)163 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
164 {
165      P *pln;
166      const S *ego = (const S *)ego_;
167      plan *cld = (plan *) 0;
168      plan *cldcpy = (plan *) 0;
169      plan *cldrest = (plan *) 0;
170      const problem_dft *p = (const problem_dft *) p_;
171      R *bufs = (R *) 0;
172      INT nbuf = 0, bufdist, n, vl;
173      INT ivs, ovs, roffset, ioffset;
174 
175      static const plan_adt padt = {
176 	  X(dft_solve), awake, print, destroy
177      };
178 
179      if (!applicable(ego, p_, plnr))
180           goto nada;
181 
182      n = X(tensor_sz)(p->sz);
183 
184      X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
185 
186      nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
187      bufdist = X(bufdist)(n, vl);
188      A(nbuf > 0);
189 
190      /* attempt to keep real and imaginary part in the same order,
191 	so as to allow optimizations in the the copy plan */
192      roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0;
193      ioffset = 1 - roffset;
194 
195      /* initial allocation for the purpose of planning */
196      bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist * 2, BUFFERS);
197 
198      /* allow destruction of input if problem is in place */
199      cld = X(mkplan_f_d)(plnr,
200 			 X(mkproblem_dft_d)(
201 			      X(mktensor_1d)(n, p->sz->dims[0].is, 2),
202 			      X(mktensor_1d)(nbuf, ivs, bufdist * 2),
203 			      TAINT(p->ri, ivs * nbuf),
204 			      TAINT(p->ii, ivs * nbuf),
205 			      bufs + roffset,
206 			      bufs + ioffset),
207 			 0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0);
208      if (!cld)
209           goto nada;
210 
211      /* copying back from the buffer is a rank-0 transform: */
212      cldcpy = X(mkplan_d)(plnr,
213 			  X(mkproblem_dft_d)(
214 			       X(mktensor_0d)(),
215 			       X(mktensor_2d)(nbuf, bufdist * 2, ovs,
216 					      n, 2, p->sz->dims[0].os),
217 			       bufs + roffset,
218 			       bufs + ioffset,
219 			       TAINT(p->ro, ovs * nbuf),
220 			       TAINT(p->io, ovs * nbuf)));
221      if (!cldcpy)
222           goto nada;
223 
224      /* deallocate buffers, let apply() allocate them for real */
225      X(ifree)(bufs);
226      bufs = 0;
227 
228      /* plan the leftover transforms (cldrest): */
229      {
230 	  INT id = ivs * (nbuf * (vl / nbuf));
231 	  INT od = ovs * (nbuf * (vl / nbuf));
232 	  cldrest = X(mkplan_d)(plnr,
233 				X(mkproblem_dft_d)(
234 				     X(tensor_copy)(p->sz),
235 				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
236 				     p->ri+id, p->ii+id, p->ro+od, p->io+od));
237      }
238      if (!cldrest)
239           goto nada;
240 
241      pln = MKPLAN_DFT(P, &padt, apply);
242      pln->cld = cld;
243      pln->cldcpy = cldcpy;
244      pln->cldrest = cldrest;
245      pln->n = n;
246      pln->vl = vl;
247      pln->ivs_by_nbuf = ivs * nbuf;
248      pln->ovs_by_nbuf = ovs * nbuf;
249      pln->roffset = roffset;
250      pln->ioffset = ioffset;
251 
252      pln->nbuf = nbuf;
253      pln->bufdist = bufdist;
254 
255      {
256 	  opcnt t;
257 	  X(ops_add)(&cld->ops, &cldcpy->ops, &t);
258 	  X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
259      }
260 
261      return &(pln->super.super);
262 
263  nada:
264      X(ifree0)(bufs);
265      X(plan_destroy_internal)(cldrest);
266      X(plan_destroy_internal)(cldcpy);
267      X(plan_destroy_internal)(cld);
268      return (plan *) 0;
269 }
270 
mksolver(size_t maxnbuf_ndx)271 static solver *mksolver(size_t maxnbuf_ndx)
272 {
273      static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
274      S *slv = MKSOLVER(S, &sadt);
275      slv->maxnbuf_ndx = maxnbuf_ndx;
276      return &(slv->super);
277 }
278 
X(dft_buffered_register)279 void X(dft_buffered_register)(planner *p)
280 {
281      size_t i;
282      for (i = 0; i < NELEM(maxnbufs); ++i)
283 	  REGISTER_SOLVER(p, mksolver(i));
284 }
285