/*
 * Copyright © 2004 Ondra Kamenik
 * Copyright © 2019 Dynare Team
 *
 * This file is part of Dynare.
 *
 * Dynare is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dynare is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dynare.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "stack_container.hh"
#include "pyramid_prod2.hh"
#include "ps_tensor.hh"

#include <memory>

// FoldedStackContainer::multAndAdd() sparse code
/* Here we multiply the sparse tensor with the FoldedStackContainer. We have
   four implementations: multAndAddSparse1(), multAndAddSparse2(),
   multAndAddSparse3(), and multAndAddSparse4(). The third is not threaded yet,
   and I expect it to be the slowest. multAndAddSparse4() exploits the
   sparsity; however, it still seems to be worse than multAndAddSparse2(), even
   for really sparse matrices. On the other hand, it can be more efficient than
   multAndAddSparse2() for large problems, since it needs much less memory and
   can therefore avoid a lot of swapping. Very preliminary examination shows
   that multAndAddSparse2() is the fastest. */
void
FoldedStackContainer::multAndAdd(const FSSparseTensor &t,
                                 FGSTensor &out) const
{
  TL_RAISE_IF(t.nvar() != getAllSize(),
              "Wrong number of variables of tensor for FoldedStackContainer::multAndAdd");
  multAndAddSparse2(t, out);
}

// FoldedStackContainer::multAndAdd() dense code
/* Here we perform the Faà Di Bruno step for a given dimension ‘dim’, and for
   the dense fully symmetric tensor which is scattered in the container
   of general symmetric tensors. The implementation is pretty much the same as
   the UnfoldedStackContainer::multAndAdd() dense code. */
void
FoldedStackContainer::multAndAdd(int dim, const FGSContainer &c, FGSTensor &out) const
{
  TL_RAISE_IF(c.num() != numStacks(),
              "Wrong symmetry length of container for FoldedStackContainer::multAndAdd");

  sthread::detach_thread_group gr;

  for (auto &si : SymmetrySet(dim, c.num()))
    if (c.check(si))
      gr.insert(std::make_unique<WorkerFoldMAADense>(*this, si, c, out));

  gr.run();
}
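
/* A note on the loop above: SymmetrySet(dim, c.num()) runs over the candidate
   symmetries of dimension ‘dim’ over c.num() stacks, that is, over the tuples
   of non-negative integers of length c.num() summing to ‘dim’, and c.check(si)
   keeps only those for which the container actually holds a tensor. A minimal
   standalone sketch of that enumeration (plain C++, independent of the tl
   classes; the helper name is made up for illustration):

     #include <iostream>
     #include <vector>

     // Print all ‘nparts’-tuples of non-negative integers summing to ‘dim’.
     void printCompositions(int dim, int nparts, std::vector<int> &prefix)
     {
       if (nparts == 1)
         {
           prefix.push_back(dim);
           for (int i : prefix)
             std::cout << i << ' ';
           std::cout << '\n';
           prefix.pop_back();
           return;
         }
       for (int first = dim; first >= 0; first--)
         {
           prefix.push_back(first);
           printCompositions(dim - first, nparts - 1, prefix);
           prefix.pop_back();
         }
     }

   For instance, printCompositions(2, 3, v) with an empty vector ‘v’ prints
   (2,0,0), (1,1,0), (1,0,1), (0,2,0), (0,1,1) and (0,0,2), one candidate
   symmetry per line; the loop above spawns a worker only for those that pass
   c.check(si). */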

/* This is analogous to WorkerUnfoldMAADense::operator()() code. */

void
WorkerFoldMAADense::operator()(std::mutex &mut)
{
  Permutation iden(dense_cont.num());
  IntSequence coor(iden.getMap().unfold(sym));
  const FGSTensor &g = dense_cont.get(sym);
  cont.multAndAddStacks(coor, g, out, mut);
}

WorkerFoldMAADense::WorkerFoldMAADense(const FoldedStackContainer &container,
                                       Symmetry s,
                                       const FGSContainer &dcontainer,
                                       FGSTensor &outten)
  : cont(container), sym(std::move(s)), dense_cont(dcontainer), out(outten)
{
}

/* This is analogous to UnfoldedStackContainer::multAndAddSparse1() code. */
void
FoldedStackContainer::multAndAddSparse1(const FSSparseTensor &t,
                                        FGSTensor &out) const
{
  sthread::detach_thread_group gr;
  UFSTensor dummy(0, numStacks(), t.dimen());
  for (Tensor::index ui = dummy.begin(); ui != dummy.end(); ++ui)
    gr.insert(std::make_unique<WorkerFoldMAASparse1>(*this, t, out, ui.getCoor()));

  gr.run();
}

/* This is analogous to WorkerUnfoldMAASparse1::operator()() code.
   The only difference is that, instead of a UPSTensor as the result
   of the multiplication of the unfolded tensor with tensors from
   the containers, we get an FPSTensor with partially folded permuted
   symmetry.

   TODO: make slice vertically narrowed according to the fill of t,
   vertically narrow out accordingly. */

void
WorkerFoldMAASparse1::operator()(std::mutex &mut)
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());
  const PermutationSet &pset = TLStatic::getPerm(t.dimen());
  Permutation iden(t.dimen());

  UPSTensor slice(t, cont.getStackSizes(), coor,
                  PerTensorDimens(cont.getStackSizes(), coor));
  for (int iper = 0; iper < pset.getNum(); iper++)
    {
      const Permutation &per = pset.get(iper);
      IntSequence percoor(coor.size());
      per.apply(coor, percoor);
      for (const auto &it : eset)
        {
          if (it.numClasses() == t.dimen())
            {
              StackProduct<FGSTensor> sp(cont, it, out.getSym());
              if (!sp.isZero(percoor))
                {
                  KronProdStack<FGSTensor> kp(sp, percoor);
                  kp.optimizeOrder();
                  const Permutation &oper = kp.getPer();
                  if (Permutation(oper, per) == iden)
                    {
                      FPSTensor fps(out.getDims(), it, slice, kp);
                      {
                        std::unique_lock<std::mutex> lk{mut};
                        fps.addTo(out);
                      }
                    }
                }
            }
        }
    }
}

WorkerFoldMAASparse1::WorkerFoldMAASparse1(const FoldedStackContainer &container,
                                           const FSSparseTensor &ten,
                                           FGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

/* Here is the second implementation of the sparse folded multAndAdd(). It
   is pretty similar to the implementation of
   UnfoldedStackContainer::multAndAddSparse2(). We make a
   dense folded slice, and then call folded multAndAddStacks(), which
   multiplies all the combinations compatible with the slice. */

void
FoldedStackContainer::multAndAddSparse2(const FSSparseTensor &t,
                                        FGSTensor &out) const
{
  sthread::detach_thread_group gr;
  FFSTensor dummy_f(0, numStacks(), t.dimen());
  for (Tensor::index fi = dummy_f.begin(); fi != dummy_f.end(); ++fi)
    gr.insert(std::make_unique<WorkerFoldMAASparse2>(*this, t, out, fi.getCoor()));

  gr.run();
}
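
/* The dummy FFSTensor above carries no data (it has zero rows); it is used
   only so that its column indices enumerate the folded stack coordinates,
   i.e. all non-decreasing ‘t.dimen()’-tuples over the stacks, with one worker
   spawned per tuple. A standalone sketch of that enumeration (plain C++,
   independent of the tl classes; the helper name is made up):

     #include <iostream>
     #include <vector>

     // Print all non-decreasing ‘dim’-tuples with entries in {0,…,nstacks-1}.
     void printFoldedCoordinates(int nstacks, int dim, std::vector<int> &prefix)
     {
       if (dim == 0)
         {
           for (int i : prefix)
             std::cout << i << ' ';
           std::cout << '\n';
           return;
         }
       int start = prefix.empty() ? 0 : prefix.back();
       for (int s = start; s < nstacks; s++)
         {
           prefix.push_back(s);
           printFoldedCoordinates(nstacks, dim - 1, prefix);
           prefix.pop_back();
         }
     }

   With 2 stacks and dimension 3 this prints (0,0,0), (0,0,1), (0,1,1) and
   (1,1,1), which correspond to the ‘fi.getCoor()’ values handed to the
   workers. */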

/* Here we make a sparse slice first and then call multAndAddStacks()
   if the slice is not empty. If the slice is really sparse, we call the
   sparse version of multAndAddStacks(). What “really sparse” means is
   given by ‘fill_threshold’. It has not been tuned yet; practice shows that
   it must be a really low number, since the sparse multAndAddStacks() is
   much slower than the dense version.

   Further, we take only the nonzero rows of the slice, and accordingly of
   the out tensor. We jump over the zero initial rows and drop the zero
   trailing rows. */
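
/* To make the branch below concrete: the fill factor is, as the name of
   getUnfoldIndexFillFactor() suggests, the share of nonzero entries of the
   slice measured on the unfolded index space. A tiny sketch of the decision,
   with made-up numbers and a made-up threshold (the real one is the static
   member FoldedStackContainer::fill_threshold used below):

     #include <iostream>

     int main()
     {
       double nonzeros = 120.0;          // nonzero entries in the slice
       double unfolded_size = 1000000.0; // total number of unfolded entries
       double fill = nonzeros / unfolded_size;
       double threshold = 0.00005;       // hypothetical value, for illustration only
       if (fill > threshold)
         std::cout << "densify the slice, use dense multAndAddStacks()\n";
       else
         std::cout << "keep the sparse slice, use sparse multAndAddStacks()\n";
       return 0;
     }
   */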

void
WorkerFoldMAASparse2::operator()(std::mutex &mut)
{
  GSSparseTensor slice(t, cont.getStackSizes(), coor,
                       TensorDimens(cont.getStackSizes(), coor));
  if (slice.getNumNonZero())
    {
      if (slice.getUnfoldIndexFillFactor() > FoldedStackContainer::fill_threshold)
        {
          FGSTensor dense_slice(slice);
          int r1 = slice.getFirstNonZeroRow();
          int r2 = slice.getLastNonZeroRow();
          FGSTensor dense_slice1(r1, r2-r1+1, dense_slice);
          FGSTensor out1(r1, r2-r1+1, out);
          cont.multAndAddStacks(coor, dense_slice1, out1, mut);
        }
      else
        cont.multAndAddStacks(coor, slice, out, mut);
    }
}

WorkerFoldMAASparse2::WorkerFoldMAASparse2(const FoldedStackContainer &container,
                                           const FSSparseTensor &ten,
                                           FGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

/* Here is the third implementation of the sparse folded
   multAndAdd(). It is a column-wise implementation, and thus not a good
   candidate for the best performer.

   We go through all columns of the output. For each column we
   calculate a folded ‘sumcol’ which is a sum of all appropriate columns
   for all suitable equivalences. So we go through all suitable
   equivalences, for each we construct a StackProduct object and
   construct an IrregTensor for the corresponding column of z. The
   IrregTensor is an abstraction for the Kronecker multiplication of
   stacked columns of the two containers without zeros. Then the column
   is added to ‘sumcol’. Finally, ‘sumcol’ is multiplied by the
   sparse tensor. */
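
/* A standalone sketch of this column-wise pattern with plain data structures
   (plain C++; the Triplet type and the helper are ad-hoc stand-ins, not the tl
   classes): for every output column one first accumulates the stacked
   Kronecker column ‘sumcol’, and then performs a single sparse
   matrix-times-vector product added into that output column.

     #include <vector>

     struct Triplet { int row, col; double val; };  // one nonzero entry of [f]

     // out_col += f·sumcol, where ‘f’ is sparse and ‘sumcol’ is dense.
     void multColumnAndAdd(const std::vector<Triplet> &f,
                           const std::vector<double> &sumcol,
                           std::vector<double> &out_col)
     {
       for (const auto &e : f)
         out_col[e.row] += e.val * sumcol[e.col];
     }

   In the real code the accumulation of ‘sumcol’ is done by
   IrregTensor::addTo() and the final product by
   FSSparseTensor::multColumnAndAdd() below. */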

void
FoldedStackContainer::multAndAddSparse3(const FSSparseTensor &t,
                                        FGSTensor &out) const
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());
  for (Tensor::index run = out.begin(); run != out.end(); ++run)
    {
      Vector outcol{out.getCol(*run)};
      FRSingleTensor sumcol(t.nvar(), t.dimen());
      sumcol.zeros();
      for (const auto &it : eset)
        if (it.numClasses() == t.dimen())
          {
            StackProduct<FGSTensor> sp(*this, it, out.getSym());
            IrregTensorHeader header(sp, run.getCoor());
            IrregTensor irten(header);
            irten.addTo(sumcol);
          }
      t.multColumnAndAdd(sumcol, outcol);
    }
}

/* Here is the fourth implementation of the sparse
   FoldedStackContainer::multAndAdd(). It is almost equivalent to
   multAndAddSparse2(), with the exception that the FPSTensor resulting
   from the product of a slice with the Kronecker product of the stack
   derivatives is calculated in a sparse fashion. For further details, see
   the FoldedStackContainer::multAndAddStacks() sparse code and the
   FPSTensor sparse constructor. */

void
FoldedStackContainer::multAndAddSparse4(const FSSparseTensor &t, FGSTensor &out) const
{
  sthread::detach_thread_group gr;
  FFSTensor dummy_f(0, numStacks(), t.dimen());
  for (Tensor::index fi = dummy_f.begin(); fi != dummy_f.end(); ++fi)
    gr.insert(std::make_unique<WorkerFoldMAASparse4>(*this, t, out, fi.getCoor()));

  gr.run();
}

/* The WorkerFoldMAASparse4 is the same as WorkerFoldMAASparse2
   with the exception that we call a sparse version of
   multAndAddStacks(). */

void
WorkerFoldMAASparse4::operator()(std::mutex &mut)
{
  GSSparseTensor slice(t, cont.getStackSizes(), coor,
                       TensorDimens(cont.getStackSizes(), coor));
  if (slice.getNumNonZero())
    cont.multAndAddStacks(coor, slice, out, mut);
}

WorkerFoldMAASparse4::WorkerFoldMAASparse4(const FoldedStackContainer &container,
                                           const FSSparseTensor &ten,
                                           FGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

// FoldedStackContainer::multAndAddStacks() dense code
/* This is almost the same as the UnfoldedStackContainer::multAndAddStacks()
   code. The only difference is that we do not construct a UPSTensor from the
   KronProdStack, but instead a partially folded permuted symmetry
   FPSTensor. Note that the tensor ‘g’ must be unfolded in order to be able to
   multiply with the unfolded rows of the Kronecker product. However, the
   columns of such a product are partially folded, giving rise to the
   FPSTensor. */
void
FoldedStackContainer::multAndAddStacks(const IntSequence &coor,
                                       const FGSTensor &g,
                                       FGSTensor &out, std::mutex &mut) const
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());

  UGSTensor ug(g);
  UFSTensor dummy_u(0, numStacks(), g.dimen());
  for (Tensor::index ui = dummy_u.begin(); ui != dummy_u.end(); ++ui)
    {
      IntSequence tmp(ui.getCoor());
      tmp.sort();
      if (tmp == coor)
        {
          Permutation sort_per(ui.getCoor());
          sort_per.inverse();
          for (const auto &it : eset)
            if (it.numClasses() == g.dimen())
              {
                StackProduct<FGSTensor> sp(*this, it, sort_per, out.getSym());
                if (!sp.isZero(coor))
                  {
                    KronProdStack<FGSTensor> kp(sp, coor);
                    if (ug.getSym().isFull())
                      kp.optimizeOrder();
                    FPSTensor fps(out.getDims(), it, sort_per, ug, kp);
                    {
                      std::unique_lock<std::mutex> lk{mut};
                      fps.addTo(out);
                    }
                  }
              }
        }
    }
}
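
/* To see why ‘g’ has to be unfolded while the result may stay partially
   folded, it helps to recall how folded and unfolded indexing differ. A
   standalone sketch for a symmetric 2-tensor over n variables (plain C++; the
   index formula applies only to this lexicographic enumeration of
   non-decreasing pairs):

     #include <algorithm>
     #include <iostream>

     int main()
     {
       int n = 3;
       // Unfolded: one column per ordered pair (i,j), n² in total.
       // Folded: one column per non-decreasing pair i ≤ j, n(n+1)/2 in total.
       for (int i = 0; i < n; i++)
         for (int j = 0; j < n; j++)
           {
             int fi = std::min(i, j), fj = std::max(i, j);
             int folded = fi*n - fi*(fi-1)/2 + (fj - fi);
             std::cout << "unfolded (" << i << ',' << j << ") -> folded column "
                       << folded << '\n';
           }
       return 0;
     }

   The rows of the Kronecker product in the function above are indexed by
   ordered (unfolded) tuples, hence the dense slice is unfolded first
   (UGSTensor ug(g)); the columns of the product are only partially folded,
   which is exactly what the FPSTensor represents. */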

// FoldedStackContainer::multAndAddStacks() sparse code
/* This is almost the same as the FoldedStackContainer::multAndAddStacks()
   dense code. The only difference is that the Kronecker product of the stacks
   is multiplied with the sparse slice (a GSSparseTensor), not with a dense
   slice (an FGSTensor). The multiplication is done in the FPSTensor sparse
   constructor. */
void
FoldedStackContainer::multAndAddStacks(const IntSequence &coor,
                                       const GSSparseTensor &g,
                                       FGSTensor &out, std::mutex &mut) const
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());
  UFSTensor dummy_u(0, numStacks(), g.dimen());
  for (Tensor::index ui = dummy_u.begin(); ui != dummy_u.end(); ++ui)
    {
      IntSequence tmp(ui.getCoor());
      tmp.sort();
      if (tmp == coor)
        {
          Permutation sort_per(ui.getCoor());
          sort_per.inverse();
          for (const auto &it : eset)
            if (it.numClasses() == g.dimen())
              {
                StackProduct<FGSTensor> sp(*this, it, sort_per, out.getSym());
                if (!sp.isZero(coor))
                  {
                    KronProdStack<FGSTensor> kp(sp, coor);
                    FPSTensor fps(out.getDims(), it, sort_per, g, kp);
                    {
                      std::unique_lock<std::mutex> lk{mut};
                      fps.addTo(out);
                    }
                  }
              }
        }
    }
}

// UnfoldedStackContainer::multAndAdd() sparse code
/*  Here we simply call either multAndAddSparse1() or
    multAndAddSparse2(). The first one allows for optimization of
    Kronecker products, so it seems to be more efficient. */
void
UnfoldedStackContainer::multAndAdd(const FSSparseTensor &t,
                                   UGSTensor &out) const
{
  TL_RAISE_IF(t.nvar() != getAllSize(),
              "Wrong number of variables of tensor for UnfoldedStackContainer::multAndAdd");
  multAndAddSparse2(t, out);
}

// UnfoldedStackContainer::multAndAdd() dense code
/* Here we implement the formula for stacks for a fully symmetric tensor
   scattered in a number of general symmetry tensors contained in a given
   container. The implementation is pretty much the same as in
   multAndAddSparse2(), but we do not make slices of the sparse tensor; we
   only do a lookup into the container.

   This means that we do not iterate through a dummy folded tensor to
   obtain folded coordinates of stacks; rather we iterate through all
   symmetries contained in the container and the coordinates of stacks
   are obtained as the unfolded identity sequence via the symmetry. The
   reason for doing this is that we are unable to calculate the symmetry from
   stack coordinates as easily as stack coordinates from the symmetry. */
void
UnfoldedStackContainer::multAndAdd(int dim, const UGSContainer &c,
                                   UGSTensor &out) const
{
  TL_RAISE_IF(c.num() != numStacks(),
              "Wrong symmetry length of container for UnfoldedStackContainer::multAndAdd");

  sthread::detach_thread_group gr;
  for (auto &si : SymmetrySet(dim, c.num()))
    if (c.check(si))
      gr.insert(std::make_unique<WorkerUnfoldMAADense>(*this, si, c, out));

  gr.run();
}

void
WorkerUnfoldMAADense::operator()(std::mutex &mut)
{
  Permutation iden(dense_cont.num());
  IntSequence coor(iden.getMap().unfold(sym));
  const UGSTensor &g = dense_cont.get(sym);
  cont.multAndAddStacks(coor, g, out, mut);
}

WorkerUnfoldMAADense::WorkerUnfoldMAADense(const UnfoldedStackContainer &container,
                                           Symmetry s,
                                           const UGSContainer &dcontainer,
                                           UGSTensor &outten)
  : cont(container), sym(std::move(s)), dense_cont(dcontainer), out(outten)
{
}
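
/* To make the “unfolded identity sequence” concrete: for stacks numbered
   0,…,n-1, a symmetry (s_0,…,s_{n-1}) is turned into the stack coordinates
   that repeat stack i exactly s_i times, which is what
   iden.getMap().unfold(sym) computes in WorkerUnfoldMAADense::operator()()
   above (and in its folded counterpart). A standalone sketch (plain C++; the
   helper name is made up):

     #include <vector>

     // Repeat stack index i exactly sym[i] times: the unfolded identity sequence.
     std::vector<int> unfoldIdentity(const std::vector<int> &sym)
     {
       std::vector<int> coor;
       for (size_t i = 0; i < sym.size(); i++)
         for (int k = 0; k < sym[i]; k++)
           coor.push_back(static_cast<int>(i));
       return coor;
     }

   For a symmetry (2,1) over two stacks this yields the stack coordinates
   (0,0,1); for (0,3) it yields (1,1,1). */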

/* Here we implement the formula for unfolded tensors. If, for instance,
   a coordinate z of a tensor [f_z²] is partitioned as
   z=(a, b), then we perform the following:

               ⎛a_c(x)⎞ ⎛a_c(y)⎞
    [f_z²] · ∑ ⎢      ⎥⊗⎢      ⎥ = [f_aa] · ∑ a_c(x)⊗a_c(y) + [f_ab] · ∑ a_c(x)⊗b_c(y)
             ᶜ ⎝b_c(x)⎠ ⎝b_c(y)⎠            ᶜ                          ᶜ

                                 + [f_ba] · ∑ b_c(x)⊗a_c(y) + [f_bb] · ∑ b_c(x)⊗b_c(y)
                                            ᶜ                          ᶜ

   This is exactly what happens here. The code is straightforward: it goes
   through all combinations of stacks, and each thread is responsible for the
   slice corresponding to one combination of the stacks. */
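
/* A self-contained numerical check of the block decomposition above, for a
   single term of the sum over c and with made-up sizes and values (plain C++;
   kron() and the small multAndAdd() helper are ad-hoc stand-ins, not the tl
   functions):

     #include <cassert>
     #include <cmath>
     #include <vector>

     using Vec = std::vector<double>;

     // Kronecker product of two column vectors: (u⊗v)[i·v.size()+j] = u[i]·v[j].
     Vec kron(const Vec &u, const Vec &v)
     {
       Vec w(u.size() * v.size());
       for (size_t i = 0; i < u.size(); i++)
         for (size_t j = 0; j < v.size(); j++)
           w[i*v.size() + j] = u[i]*v[j];
       return w;
     }

     // y += M·x, where M has ‘rows’ rows and is stored row by row.
     void multAndAdd(const Vec &M, int rows, const Vec &x, Vec &y)
     {
       int cols = static_cast<int>(x.size());
       for (int r = 0; r < rows; r++)
         for (int c = 0; c < cols; c++)
           y[r] += M[r*cols + c] * x[c];
     }

     int main()
     {
       // z = (a, b) with 2 and 1 coordinates, so nz = 3; a single output row.
       Vec ax{1.0, 2.0}, bx{3.0};   // a_c(x), b_c(x) for one c
       Vec ay{0.5, -1.0}, by{2.0};  // a_c(y), b_c(y)
       Vec zx{1.0, 2.0, 3.0}, zy{0.5, -1.0, 2.0};

       // [f_z²] with one row and nz² = 9 columns, ordered like z(x)⊗z(y).
       Vec f{1, 2, 3, 4, 5, 6, 7, 8, 9};

       // Left-hand side: [f_z²]·(z(x)⊗z(y)).
       Vec lhs(1, 0.0);
       multAndAdd(f, 1, kron(zx, zy), lhs);

       // Right-hand side: the four blocks of [f_z²], picked by which part of z
       // each of the two Kronecker indices falls into.
       Vec faa{1, 2, 4, 5}, fab{3, 6}, fba{7, 8}, fbb{9};
       Vec rhs(1, 0.0);
       multAndAdd(faa, 1, kron(ax, ay), rhs);
       multAndAdd(fab, 1, kron(ax, by), rhs);
       multAndAdd(fba, 1, kron(bx, ay), rhs);
       multAndAdd(fbb, 1, kron(bx, by), rhs);

       assert(std::abs(lhs[0] - rhs[0]) < 1e-12);
       return 0;
     }
   */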

void
UnfoldedStackContainer::multAndAddSparse1(const FSSparseTensor &t,
                                          UGSTensor &out) const
{
  sthread::detach_thread_group gr;
  UFSTensor dummy(0, numStacks(), t.dimen());
  for (Tensor::index ui = dummy.begin(); ui != dummy.end(); ++ui)
    gr.insert(std::make_unique<WorkerUnfoldMAASparse1>(*this, t, out, ui.getCoor()));

  gr.run();
}

/* This does a step of UnfoldedStackContainer::multAndAddSparse1() for
   given coordinates. First it makes the slice for the given stack coordinates.
   Then it multiplies everything that should be multiplied with the slice:
   it goes through all equivalences, creates a StackProduct, then a
   KronProdStack, which is added to ‘out’. So far everything is clear.

   However, we want to use the optimized KronProdAllOptim to minimize the
   number of flops and the memory needed in the Kronecker product. So we go
   through all permutations ‘per’, permute the coordinates to get ‘percoor’, go
   through all equivalences, and make a KronProdStack and optimize it. The
   result of the optimization is a permutation ‘oper’. Now, we multiply the
   Kronecker product with the slice only if the slice has the same ordering of
   coordinates as the KronProdStack. However, this is not perfectly true.
   Since we go through *all* permutations ‘per’, there might be two different
   permutations leading to the same ordering in the KronProdStack and thus the
   same ordering in the optimized KronProdStack. The two cases would be counted
   twice, which is wrong. That is why we do not condition on
   coor∘oper∘per = coor, but we condition on oper∘per = id. In this way, we
   rule out permutations ‘per’ leading to the same ordering of stacks when
   applied to ‘coor’.

   TODO: vertically narrow slice and out according to the fill in t. */
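
/* The double counting is easiest to see when the stack coordinates contain a
   repeated entry. A standalone sketch (plain C++; it only demonstrates that
   distinct position permutations can produce the same permuted coordinates,
   which is why the test on the composed permutation is needed):

     #include <algorithm>
     #include <iostream>
     #include <vector>

     int main()
     {
       std::vector<int> coor{0, 0, 1};      // stack coordinates with a repeat
       std::vector<int> positions{0, 1, 2}; // positions to permute

       do
         {
           std::vector<int> percoor(coor.size());
           for (size_t i = 0; i < coor.size(); i++)
             percoor[i] = coor[positions[i]];
           for (int x : percoor)
             std::cout << x << ' ';
           std::cout << '\n';
         }
       while (std::next_permutation(positions.begin(), positions.end()));
       return 0;
     }

   The six permutations yield only three distinct orderings, (0,0,1), (0,1,0)
   and (1,0,0), each of them twice. Conditioning on oper∘per = id, rather than
   on the permuted coordinates themselves, keeps at most one ‘per’ for each
   ordering, so no contribution is added twice. */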

void
WorkerUnfoldMAASparse1::operator()(std::mutex &mut)
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());
  const PermutationSet &pset = TLStatic::getPerm(t.dimen());
  Permutation iden(t.dimen());

  UPSTensor slice(t, cont.getStackSizes(), coor,
                  PerTensorDimens(cont.getStackSizes(), coor));
  for (int iper = 0; iper < pset.getNum(); iper++)
    {
      const Permutation &per = pset.get(iper);
      IntSequence percoor(coor.size());
      per.apply(coor, percoor);
      for (const auto &it : eset)
        if (it.numClasses() == t.dimen())
          {
            StackProduct<UGSTensor> sp(cont, it, out.getSym());
            if (!sp.isZero(percoor))
              {
                KronProdStack<UGSTensor> kp(sp, percoor);
                kp.optimizeOrder();
                const Permutation &oper = kp.getPer();
                if (Permutation(oper, per) == iden)
                  {
                    UPSTensor ups(out.getDims(), it, slice, kp);
                    {
                      std::unique_lock<std::mutex> lk{mut};
                      ups.addTo(out);
                    }
                  }
              }
          }
    }
}

WorkerUnfoldMAASparse1::WorkerUnfoldMAASparse1(const UnfoldedStackContainer &container,
                                               const FSSparseTensor &ten,
                                               UGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

/* Here we implement the formula in a slightly different way. We use the
   fact, in the notation of UnfoldedStackContainer::multAndAddSparse2(),
   that
                                      ⎛               ⎞
   [f_ba] · ∑ b_c(x)⊗a_c(y) = [f_ba] ·⎢∑ a_c(y)⊗b_c(x)⎥· P
            ᶜ                         ⎝ᶜ              ⎠

   where P is a suitable permutation of columns. The permutation
   corresponds (in this example) to a swap of a and b. An advantage
   of this approach is that we do not need a UPSTensor for [f_ba], and
   thus we decrease the number of needed slices.

   So we go through all folded indices of stack coordinates, and for
   each such index ‘fi’ we make a slice and call multAndAddStacks(). This
   goes through all corresponding unfolded indices to perform the
   formula. Each unsorted (unfolded) index implies a sorting permutation
   ‘sort_per’ which must be used to permute the stacks in the StackProduct, and
   to permute the equivalence classes when the UPSTensor is formed. In this way
   the column permutation P from the formula is factored into the
   permutation of the UPSTensor. */
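
/* The fact used above is the “commutation” property of the Kronecker product:
   b⊗a contains the same entries as a⊗b, only rearranged by a fixed
   permutation. A minimal standalone check for column vectors (plain C++):

     #include <cassert>
     #include <vector>

     int main()
     {
       std::vector<double> a{1.0, 2.0, 3.0}; // length m
       std::vector<double> b{4.0, 5.0};      // length n
       size_t m = a.size(), n = b.size();

       std::vector<double> ab(m*n), ba(m*n);
       for (size_t i = 0; i < m; i++)
         for (size_t j = 0; j < n; j++)
           {
             ab[i*n + j] = a[i]*b[j]; // a⊗b
             ba[j*m + i] = b[j]*a[i]; // b⊗a
           }

       // Entry i·n+j of a⊗b sits at position j·m+i of b⊗a.
       for (size_t i = 0; i < m; i++)
         for (size_t j = 0; j < n; j++)
           assert(ba[j*m + i] == ab[i*n + j]);
       return 0;
     }

   In the two-stack example above this rearrangement is the column permutation
   P, and, as the comment explains, the code factors it into the permutation
   carried by the UPSTensor instead of forming a separate slice for [f_ba]. */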

void
UnfoldedStackContainer::multAndAddSparse2(const FSSparseTensor &t,
                                          UGSTensor &out) const
{
  sthread::detach_thread_group gr;
  FFSTensor dummy_f(0, numStacks(), t.dimen());
  for (Tensor::index fi = dummy_f.begin(); fi != dummy_f.end(); ++fi)
    gr.insert(std::make_unique<WorkerUnfoldMAASparse2>(*this, t, out, fi.getCoor()));

  gr.run();
}

/* This does a step of UnfoldedStackContainer::multAndAddSparse2() for given
   coordinates.

   TODO: implement multAndAddStacks() for sparse slice as
   FoldedStackContainer::multAndAddStacks() sparse code and do this method as
   WorkerFoldMAASparse2::operator()(). */

void
WorkerUnfoldMAASparse2::operator()(std::mutex &mut)
{
  GSSparseTensor slice(t, cont.getStackSizes(), coor,
                       TensorDimens(cont.getStackSizes(), coor));
  if (slice.getNumNonZero())
    {
      FGSTensor fslice(slice);
      UGSTensor dense_slice(fslice);
      int r1 = slice.getFirstNonZeroRow();
      int r2 = slice.getLastNonZeroRow();
      UGSTensor dense_slice1(r1, r2-r1+1, dense_slice);
      UGSTensor out1(r1, r2-r1+1, out);

      cont.multAndAddStacks(coor, dense_slice1, out1, mut);
    }
}

WorkerUnfoldMAASparse2::WorkerUnfoldMAASparse2(const UnfoldedStackContainer &container,
                                               const FSSparseTensor &ten,
                                               UGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

/* For given folded coordinates of stacks ‘fi’, and an appropriate
   tensor ‘g’, whose symmetry is a symmetry of ‘fi’, the method
   contributes to ‘out’ all tensors in the unfolded stack formula involving
   the stacks chosen by ‘fi’.

   We go through all ‘ui’ coordinates which yield ‘fi’ after sorting. We
   construct a permutation ‘sort_per’ which sorts ‘ui’ to ‘fi’. We go
   through all appropriate equivalences, and construct a StackProduct
   from the equivalence classes permuted by ‘sort_per’, then a UPSTensor
   whose column permutation is implied by the equivalence permuted by
   ‘sort_per’. The UPSTensor is then added to ‘out’.

   We cannot use the optimized KronProdStack here, since the symmetry
   of ‘UGSTensor& g’ prescribes the ordering of the stacks. However, if
   ‘g’ is fully symmetric, we can do the optimization harmlessly. */
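
/* A standalone sketch of the enumeration of the ‘ui’ coordinates belonging to
   one ‘fi’, together with the sorting permutation (plain C++; no tl classes,
   and the argsort below is only a stand-in for Permutation(ui) followed by
   inverse()):

     #include <algorithm>
     #include <iostream>
     #include <numeric>
     #include <vector>

     int main()
     {
       // A folded stack coordinate, i.e. a sorted tuple, e.g. fi = (0,0,1).
       std::vector<int> fi{0, 0, 1};

       // The unfolded coordinates ‘ui’ that sort to ‘fi’ are exactly the
       // distinct rearrangements of ‘fi’.
       std::vector<int> ui = fi;
       do
         {
           // ‘sort_per’: positions that put ‘ui’ back into sorted order.
           std::vector<int> sort_per(ui.size());
           std::iota(sort_per.begin(), sort_per.end(), 0);
           std::stable_sort(sort_per.begin(), sort_per.end(),
                            [&ui](int p, int q) { return ui[p] < ui[q]; });

           for (int x : ui)
             std::cout << x << ' ';
           std::cout << "  sort_per:";
           for (int p : sort_per)
             std::cout << ' ' << p;
           std::cout << '\n';
         }
       while (std::next_permutation(ui.begin(), ui.end()));
       return 0;
     }

   For fi = (0,0,1) this prints the three unfolded coordinates (0,0,1),
   (0,1,0) and (1,0,0) with their sorting permutations; the loop below does
   the analogous filtering with ‘tmp.sort()’ and builds ‘sort_per’ from the
   unsorted coordinates. */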

void
UnfoldedStackContainer::multAndAddStacks(const IntSequence &fi,
                                         const UGSTensor &g,
                                         UGSTensor &out, std::mutex &mut) const
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());

  UFSTensor dummy_u(0, numStacks(), g.dimen());
  for (Tensor::index ui = dummy_u.begin(); ui != dummy_u.end(); ++ui)
    {
      IntSequence tmp(ui.getCoor());
      tmp.sort();
      if (tmp == fi)
        {
          Permutation sort_per(ui.getCoor());
          sort_per.inverse();
          for (const auto &it : eset)
            if (it.numClasses() == g.dimen())
              {
                StackProduct<UGSTensor> sp(*this, it, sort_per, out.getSym());
                if (!sp.isZero(fi))
                  {
                    KronProdStack<UGSTensor> kp(sp, fi);
                    if (g.getSym().isFull())
                      kp.optimizeOrder();
                    UPSTensor ups(out.getDims(), it, sort_per, g, kp);
                    {
                      std::unique_lock<std::mutex> lk{mut};
                      ups.addTo(out);
                    }
                  }
              }
        }
    }
}