/*
 * Copyright © 2004 Ondra Kamenik
 * Copyright © 2019 Dynare Team
 *
 * This file is part of Dynare.
 *
 * Dynare is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dynare is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dynare. If not, see <http://www.gnu.org/licenses/>.
 */

#include "stack_container.hh"
#include "pyramid_prod2.hh"
#include "ps_tensor.hh"

#include <memory>

// FoldedStackContainer::multAndAdd() sparse code
/* Here we multiply the sparse tensor with the FoldedStackContainer. We have
   four implementations: multAndAddSparse1(), multAndAddSparse2(),
   multAndAddSparse3(), and multAndAddSparse4(). The third is not threaded
   yet and is most likely the slowest. multAndAddSparse4() exploits the
   sparsity; however, it still seems to be slower than multAndAddSparse2()
   even for really sparse matrices. On the other hand, it can be more
   efficient than multAndAddSparse2() for large problems, since it needs much
   less memory and can avoid a lot of swapping. Very preliminary experiments
   show that multAndAddSparse2() is the fastest. */
void
FoldedStackContainer::multAndAdd(const FSSparseTensor &t,
                                 FGSTensor &out) const
{
  TL_RAISE_IF(t.nvar() != getAllSize(),
              "Wrong number of variables of tensor for FoldedStackContainer::multAndAdd");
  multAndAddSparse2(t, out);
}

// FoldedStackContainer::multAndAdd() dense code
/* Here we perform the Faà Di Bruno step for a given dimension ‘dim’, and for
   the dense fully symmetric tensor which is scattered in the container
   of general symmetric tensors. The implementation is pretty much the same
   as the UnfoldedStackContainer::multAndAdd() dense code. */
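/* For instance, with dim = 3 and two stacks, SymmetrySet(3, 2) enumerates
   the symmetries (3,0), (2,1), (1,2) and (0,3); one WorkerFoldMAADense is
   spawned for every symmetry actually present in the container ‘c’. */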
void
FoldedStackContainer::multAndAdd(int dim, const FGSContainer &c, FGSTensor &out) const
{
  TL_RAISE_IF(c.num() != numStacks(),
              "Wrong symmetry length of container for FoldedStackContainer::multAndAdd");

  sthread::detach_thread_group gr;

  for (auto &si : SymmetrySet(dim, c.num()))
    if (c.check(si))
      gr.insert(std::make_unique<WorkerFoldMAADense>(*this, si, c, out));

  gr.run();
}

/* This is analogous to WorkerUnfoldMAADense::operator()() code. */

void
WorkerFoldMAADense::operator()(std::mutex &mut)
{
  Permutation iden(dense_cont.num());
  IntSequence coor(iden.getMap().unfold(sym));
  const FGSTensor &g = dense_cont.get(sym);
  cont.multAndAddStacks(coor, g, out, mut);
}

WorkerFoldMAADense::WorkerFoldMAADense(const FoldedStackContainer &container,
                                       Symmetry s,
                                       const FGSContainer &dcontainer,
                                       FGSTensor &outten)
  : cont(container), sym(std::move(s)), dense_cont(dcontainer), out(outten)
{
}

/* This is analogous to UnfoldedStackContainer::multAndAddSparse1() code. */
void
FoldedStackContainer::multAndAddSparse1(const FSSparseTensor &t,
                                        FGSTensor &out) const
{
  sthread::detach_thread_group gr;
  UFSTensor dummy(0, numStacks(), t.dimen());
  for (Tensor::index ui = dummy.begin(); ui != dummy.end(); ++ui)
    gr.insert(std::make_unique<WorkerFoldMAASparse1>(*this, t, out, ui.getCoor()));

  gr.run();
}

/* This is analogous to the WorkerUnfoldMAASparse1::operator()() code. The
   only difference is that, instead of a UPSTensor as the result of the
   multiplication of the unfolded tensor and tensors from the containers, we
   have an FPSTensor with a partially folded permuted symmetry.

   TODO: make the slice vertically narrowed according to the fill of t, and
   vertically narrow ‘out’ accordingly. */

void
WorkerFoldMAASparse1::operator()(std::mutex &mut)
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());
  const PermutationSet &pset = TLStatic::getPerm(t.dimen());
  Permutation iden(t.dimen());

  UPSTensor slice(t, cont.getStackSizes(), coor,
                  PerTensorDimens(cont.getStackSizes(), coor));
  for (int iper = 0; iper < pset.getNum(); iper++)
    {
      const Permutation &per = pset.get(iper);
      IntSequence percoor(coor.size());
      per.apply(coor, percoor);
      for (const auto &it : eset)
        {
          if (it.numClasses() == t.dimen())
            {
              StackProduct<FGSTensor> sp(cont, it, out.getSym());
              if (!sp.isZero(percoor))
                {
                  KronProdStack<FGSTensor> kp(sp, percoor);
                  kp.optimizeOrder();
                  const Permutation &oper = kp.getPer();
                  if (Permutation(oper, per) == iden)
                    {
                      FPSTensor fps(out.getDims(), it, slice, kp);
                      {
                        std::unique_lock<std::mutex> lk{mut};
                        fps.addTo(out);
                      }
                    }
                }
            }
        }
    }
}

WorkerFoldMAASparse1::WorkerFoldMAASparse1(const FoldedStackContainer &container,
                                           const FSSparseTensor &ten,
                                           FGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

/* Here is the second implementation of the sparse folded multAndAdd(). It is
   quite similar to the UnfoldedStackContainer::multAndAddSparse2() code. We
   make a dense folded slice, and then call the folded multAndAddStacks(),
   which multiplies all the combinations compatible with the slice. */
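/* For instance, with three stacks and t.dimen() == 2, the folded dummy
   indices below are (0,0), (0,1), (0,2), (1,1), (1,2) and (2,2), so one
   worker is spawned per unordered pair of stacks. */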

void
FoldedStackContainer::multAndAddSparse2(const FSSparseTensor &t,
                                        FGSTensor &out) const
{
  sthread::detach_thread_group gr;
  FFSTensor dummy_f(0, numStacks(), t.dimen());
  for (Tensor::index fi = dummy_f.begin(); fi != dummy_f.end(); ++fi)
    gr.insert(std::make_unique<WorkerFoldMAASparse2>(*this, t, out, fi.getCoor()));

  gr.run();
}

/* Here we first make a sparse slice, and then call multAndAddStacks() if the
   slice is not empty. If the slice is really sparse, we call the sparse
   version of multAndAddStacks(). What “really sparse” means is given by
   ‘fill_threshold’. It has not been tuned yet; practice shows that it must
   be a really low number, since the sparse multAndAddStacks() is much slower
   than the dense version.

   Further, we take only the nonzero rows of the slice, and correspondingly
   of the out tensor. We jump over the zero initial rows and drop the zero
   trailing rows. */
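/* For example, if the slice has nonzero rows only between r1 = 10 and
   r2 = 25, then only rows 10…25 of both the dense slice and ‘out’ enter the
   dense multAndAddStacks() call below. */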

void
WorkerFoldMAASparse2::operator()(std::mutex &mut)
{
  GSSparseTensor slice(t, cont.getStackSizes(), coor,
                       TensorDimens(cont.getStackSizes(), coor));
  if (slice.getNumNonZero())
    {
      if (slice.getUnfoldIndexFillFactor() > FoldedStackContainer::fill_threshold)
        {
          FGSTensor dense_slice(slice);
          int r1 = slice.getFirstNonZeroRow();
          int r2 = slice.getLastNonZeroRow();
          FGSTensor dense_slice1(r1, r2-r1+1, dense_slice);
          FGSTensor out1(r1, r2-r1+1, out);
          cont.multAndAddStacks(coor, dense_slice1, out1, mut);
        }
      else
        cont.multAndAddStacks(coor, slice, out, mut);
    }
}

WorkerFoldMAASparse2::WorkerFoldMAASparse2(const FoldedStackContainer &container,
                                           const FSSparseTensor &ten,
                                           FGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

/* Here is the third implementation of the sparse folded multAndAdd(). It is
   a column-wise implementation, and thus not a good candidate for the best
   performer.

   We go through all columns of the output. For each column we calculate the
   folded ‘sumcol’, which is a sum of all appropriate columns for all
   suitable equivalences. So we go through all suitable equivalences; for
   each of them we construct a StackProduct object and construct an
   IrregTensor for the corresponding column of z. The IrregTensor is an
   abstraction for the Kronecker multiplication of stacked columns of the two
   containers without zeros. Then the column is added to ‘sumcol’. Finally,
   ‘sumcol’ is multiplied by the sparse tensor. */

void
FoldedStackContainer::multAndAddSparse3(const FSSparseTensor &t,
                                        FGSTensor &out) const
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());
  for (Tensor::index run = out.begin(); run != out.end(); ++run)
    {
      Vector outcol{out.getCol(*run)};
      FRSingleTensor sumcol(t.nvar(), t.dimen());
      sumcol.zeros();
      for (const auto &it : eset)
        if (it.numClasses() == t.dimen())
          {
            StackProduct<FGSTensor> sp(*this, it, out.getSym());
            IrregTensorHeader header(sp, run.getCoor());
            IrregTensor irten(header);
            irten.addTo(sumcol);
          }
      t.multColumnAndAdd(sumcol, outcol);
    }
}

/* Here is the fourth implementation of the sparse
   FoldedStackContainer::multAndAdd(). It is almost equivalent to
   multAndAddSparse2(), with the exception that the FPSTensor resulting from
   the product of a slice and the Kronecker product of the stack derivatives
   is calculated in a sparse fashion. For further details, see the
   FoldedStackContainer::multAndAddStacks() sparse code and the FPSTensor
   sparse constructor. */

void
FoldedStackContainer::multAndAddSparse4(const FSSparseTensor &t, FGSTensor &out) const
{
  sthread::detach_thread_group gr;
  FFSTensor dummy_f(0, numStacks(), t.dimen());
  for (Tensor::index fi = dummy_f.begin(); fi != dummy_f.end(); ++fi)
    gr.insert(std::make_unique<WorkerFoldMAASparse4>(*this, t, out, fi.getCoor()));

  gr.run();
}

/* The WorkerFoldMAASparse4 is the same as WorkerFoldMAASparse2,
   with the exception that we call a sparse version of
   multAndAddStacks(). */

void
WorkerFoldMAASparse4::operator()(std::mutex &mut)
{
  GSSparseTensor slice(t, cont.getStackSizes(), coor,
                       TensorDimens(cont.getStackSizes(), coor));
  if (slice.getNumNonZero())
    cont.multAndAddStacks(coor, slice, out, mut);
}

WorkerFoldMAASparse4::WorkerFoldMAASparse4(const FoldedStackContainer &container,
                                           const FSSparseTensor &ten,
                                           FGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

// FoldedStackContainer::multAndAddStacks() dense code
/* This is almost the same as the UnfoldedStackContainer::multAndAddStacks()
   code. The only difference is that we do not construct a UPSTensor from the
   KronProdStack, but rather a partially folded permuted symmetry FPSTensor.
   Note that the tensor ‘g’ must be unfolded in order to be multiplied with
   the unfolded rows of the Kronecker product. However, the columns of such a
   product are partially folded, giving rise to the FPSTensor. */
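/* For instance, if coor = (0,1,1), the unfolded dummy indices (0,1,1),
   (1,0,1) and (1,1,0) all sort to ‘coor’; each contributes with its own
   sorting permutation ‘sort_per’, so a single folded slice ‘g’ serves all
   three orderings of the stacks. */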
void
FoldedStackContainer::multAndAddStacks(const IntSequence &coor,
                                       const FGSTensor &g,
                                       FGSTensor &out, std::mutex &mut) const
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());

  UGSTensor ug(g);
  UFSTensor dummy_u(0, numStacks(), g.dimen());
  for (Tensor::index ui = dummy_u.begin(); ui != dummy_u.end(); ++ui)
    {
      IntSequence tmp(ui.getCoor());
      tmp.sort();
      if (tmp == coor)
        {
          Permutation sort_per(ui.getCoor());
          sort_per.inverse();
          for (const auto &it : eset)
            if (it.numClasses() == g.dimen())
              {
                StackProduct<FGSTensor> sp(*this, it, sort_per, out.getSym());
                if (!sp.isZero(coor))
                  {
                    KronProdStack<FGSTensor> kp(sp, coor);
                    if (ug.getSym().isFull())
                      kp.optimizeOrder();
                    FPSTensor fps(out.getDims(), it, sort_per, ug, kp);
                    {
                      std::unique_lock<std::mutex> lk{mut};
                      fps.addTo(out);
                    }
                  }
              }
        }
    }
}

// FoldedStackContainer::multAndAddStacks() sparse code
/* This is almost the same as FoldedStackContainer::multAndAddStacks() dense
   code. The only difference is that the Kronecker product of the stacks is
   multiplied with the sparse slice GSSparseTensor (not the dense slice
   FGSTensor). The multiplication is done in the FPSTensor sparse
   constructor. */
void
FoldedStackContainer::multAndAddStacks(const IntSequence &coor,
                                       const GSSparseTensor &g,
                                       FGSTensor &out, std::mutex &mut) const
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());
  UFSTensor dummy_u(0, numStacks(), g.dimen());
  for (Tensor::index ui = dummy_u.begin(); ui != dummy_u.end(); ++ui)
    {
      IntSequence tmp(ui.getCoor());
      tmp.sort();
      if (tmp == coor)
        {
          Permutation sort_per(ui.getCoor());
          sort_per.inverse();
          for (const auto &it : eset)
            if (it.numClasses() == g.dimen())
              {
                StackProduct<FGSTensor> sp(*this, it, sort_per, out.getSym());
                if (!sp.isZero(coor))
                  {
                    KronProdStack<FGSTensor> kp(sp, coor);
                    FPSTensor fps(out.getDims(), it, sort_per, g, kp);
                    {
                      std::unique_lock<std::mutex> lk{mut};
                      fps.addTo(out);
                    }
                  }
              }
        }
    }
}

// UnfoldedStackContainer::multAndAdd() sparse code
/* Here we simply call either multAndAddSparse1() or multAndAddSparse2(). The
   first one allows for optimization of Kronecker products, so it would seem
   to be more efficient; currently, however, multAndAddSparse2() is the one
   being called. */
void
UnfoldedStackContainer::multAndAdd(const FSSparseTensor &t,
                                   UGSTensor &out) const
{
  TL_RAISE_IF(t.nvar() != getAllSize(),
              "Wrong number of variables of tensor for UnfoldedStackContainer::multAndAdd");
  multAndAddSparse2(t, out);
}

// UnfoldedStackContainer::multAndAdd() dense code
/* Here we implement the formula for stacks for a fully symmetric tensor
   scattered into a number of general symmetry tensors contained in a given
   container. The implementation is pretty much the same as in
   multAndAddSparse2(), but we do not make slices of the sparse tensor; we
   only do a lookup in the container.

   This means that we do not iterate through a dummy folded tensor to obtain
   folded coordinates of stacks; rather, we iterate through all symmetries
   contained in the container, and the coordinates of the stacks are obtained
   by unfolding the identity sequence through the symmetry. The reason for
   doing this is that we cannot calculate the symmetry from the stack
   coordinates as easily as the stack coordinates from the symmetry. */
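/* For instance, with three stacks and the symmetry (1,0,2), the identity
   sequence (0,1,2) unfolded through the symmetry gives the stack coordinates
   (0,2,2): each stack index repeated as many times as its multiplicity in
   the symmetry. (Illustrative reading of iden.getMap().unfold(sym); see
   WorkerUnfoldMAADense::operator()() below.) */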
void
UnfoldedStackContainer::multAndAdd(int dim, const UGSContainer &c,
                                   UGSTensor &out) const
{
  TL_RAISE_IF(c.num() != numStacks(),
              "Wrong symmetry length of container for UnfoldedStackContainer::multAndAdd");

  sthread::detach_thread_group gr;
  for (auto &si : SymmetrySet(dim, c.num()))
    if (c.check(si))
      gr.insert(std::make_unique<WorkerUnfoldMAADense>(*this, si, c, out));

  gr.run();
}

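/* Compute the stack coordinates implied by the symmetry ‘sym’ (the identity
   map unfolded through the symmetry), fetch the corresponding dense tensor
   from the container, and pass everything on to multAndAddStacks(). */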
void
WorkerUnfoldMAADense::operator()(std::mutex &mut)
{
  Permutation iden(dense_cont.num());
  IntSequence coor(iden.getMap().unfold(sym));
  const UGSTensor &g = dense_cont.get(sym);
  cont.multAndAddStacks(coor, g, out, mut);
}

WorkerUnfoldMAADense::WorkerUnfoldMAADense(const UnfoldedStackContainer &container,
                                           Symmetry s,
                                           const UGSContainer &dcontainer,
                                           UGSTensor &outten)
  : cont(container), sym(std::move(s)), dense_cont(dcontainer), out(outten)
{
}

/* Here we implement the formula for unfolded tensors. If, for instance, a
   coordinate z of a tensor [f_z²] is partitioned as z=(a,b), then we perform
   the following:

                   ⎛a_c(x)⎞ ⎛a_c(y)⎞
        [f_z²] · ∑ ⎢      ⎥⊗⎢      ⎥ = [f_aa] · ∑ a_c(x)⊗a_c(y) + [f_ab] · ∑ a_c(x)⊗b_c(y)
                 ᶜ ⎝b_c(x)⎠ ⎝b_c(y)⎠            ᶜ                          ᶜ

            + [f_ba] · ∑ b_c(x)⊗a_c(y) + [f_bb] · ∑ b_c(x)⊗b_c(y)
                       ᶜ                          ᶜ

   This is exactly what happens here. The code goes through all combinations
   of stacks, and each thread is responsible for the operation on the slice
   corresponding to one combination of the stacks. */
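/* With the two stacks a and b of the example and t.dimen() == 2, the dummy
   unfolded indices below are (0,0), (0,1), (1,0) and (1,1), i.e. one worker
   per ordered pair of stacks, matching the four terms [f_aa], [f_ab], [f_ba]
   and [f_bb] above. */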

void
UnfoldedStackContainer::multAndAddSparse1(const FSSparseTensor &t,
                                          UGSTensor &out) const
{
  sthread::detach_thread_group gr;
  UFSTensor dummy(0, numStacks(), t.dimen());
  for (Tensor::index ui = dummy.begin(); ui != dummy.end(); ++ui)
    gr.insert(std::make_unique<WorkerUnfoldMAASparse1>(*this, t, out, ui.getCoor()));

  gr.run();
}

/* This does a step of UnfoldedStackContainer::multAndAddSparse1() for given
   coordinates. First it makes the slice of the given stack coordinates. Then
   it multiplies that slice with everything it should be multiplied with: it
   goes through all equivalences, creates the StackProduct, then the
   KronProdStack, which is added to ‘out’. So far everything is clear.

   However, we want to use the optimized KronProdAllOptim to minimize the
   number of flops and the memory needed in the Kronecker product. So we go
   through all permutations ‘per’, permute the coordinates to get ‘percoor’,
   go through all equivalences, and make the KronProdStack and optimize it.
   The result of the optimization is a permutation ‘oper’. Now, we multiply
   the Kronecker product with the slice only if the slice has the same
   ordering of coordinates as the Kronecker product KronProdStack. However,
   this is not the whole story. Since we go through *all* permutations ‘per’,
   there might be two different permutations leading to the same ordering in
   KronProdStack and thus the same ordering in the optimized KronProdStack.
   The two cases would be counted twice, which is wrong. That is why we do
   not condition on coor∘oper∘per = coor, but instead on oper∘per = id. In
   this way, we rule out permutations ‘per’ leading to the same ordering of
   stacks when applied to ‘coor’.

   TODO: vertically narrow slice and out according to the fill in t. */
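/* For instance, if coor = (0,0), the identity and the swap of the two
   positions both yield percoor = (0,0) and hence the same optimized ordering
   ‘oper’; only the permutation satisfying oper∘per = id contributes, so the
   identical term is not added twice. */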

void
WorkerUnfoldMAASparse1::operator()(std::mutex &mut)
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());
  const PermutationSet &pset = TLStatic::getPerm(t.dimen());
  Permutation iden(t.dimen());

  UPSTensor slice(t, cont.getStackSizes(), coor,
                  PerTensorDimens(cont.getStackSizes(), coor));
  for (int iper = 0; iper < pset.getNum(); iper++)
    {
      const Permutation &per = pset.get(iper);
      IntSequence percoor(coor.size());
      per.apply(coor, percoor);
      for (const auto &it : eset)
        if (it.numClasses() == t.dimen())
          {
            StackProduct<UGSTensor> sp(cont, it, out.getSym());
            if (!sp.isZero(percoor))
              {
                KronProdStack<UGSTensor> kp(sp, percoor);
                kp.optimizeOrder();
                const Permutation &oper = kp.getPer();
                if (Permutation(oper, per) == iden)
                  {
                    UPSTensor ups(out.getDims(), it, slice, kp);
                    {
                      std::unique_lock<std::mutex> lk{mut};
                      ups.addTo(out);
                    }
                  }
              }
          }
    }
}

WorkerUnfoldMAASparse1::WorkerUnfoldMAASparse1(const UnfoldedStackContainer &container,
                                               const FSSparseTensor &ten,
                                               UGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

/* Here we implement the formula in a slightly different way. We use the
   fact, in the notation introduced above for multAndAddSparse1(), that

                                           ⎛               ⎞
        [f_ba] · ∑ b_c(x)⊗a_c(y) = [f_ba] ·⎢∑ a_c(y)⊗b_c(x)⎥· P
                 ᶜ                         ⎝ᶜ              ⎠

   where P is a suitable permutation of columns. The permutation corresponds
   (in this example) to a swap of a and b. An advantage of this approach is
   that we do not need a UPSTensor for [f_ba], and thus we decrease the
   number of needed slices.

   So we go through all folded indices of stack coordinates; then for each
   such index ‘fi’ we make a slice and call multAndAddStacks(). This goes
   through all corresponding unfolded indices to perform the formula. Each
   unsorted (unfolded) index implies a sorting permutation ‘sort_per’ which
   must be used to permute the stacks in the StackProduct, and to permute the
   equivalence classes when the UPSTensor is formed. In this way the column
   permutation P from the formula is factored into the permutation of the
   UPSTensor. */
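/* For instance, with two stacks the folded index fi = (0,1) covers both
   unfolded orderings (0,1) and (1,0); the second one is handled through the
   sorting permutation ‘sort_per’ inside multAndAddStacks(), so the single
   slice made for (0,1) also serves the [f_ba] term and no extra slice is
   needed. */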

void
UnfoldedStackContainer::multAndAddSparse2(const FSSparseTensor &t,
                                          UGSTensor &out) const
{
  sthread::detach_thread_group gr;
  FFSTensor dummy_f(0, numStacks(), t.dimen());
  for (Tensor::index fi = dummy_f.begin(); fi != dummy_f.end(); ++fi)
    gr.insert(std::make_unique<WorkerUnfoldMAASparse2>(*this, t, out, fi.getCoor()));

  gr.run();
}

/* This does a step of UnfoldedStackContainer::multAndAddSparse2() for given
   coordinates.

   TODO: implement multAndAddStacks() for a sparse slice, as in the
   FoldedStackContainer::multAndAddStacks() sparse code, and do this method
   as in WorkerFoldMAASparse2::operator()(). */

void
WorkerUnfoldMAASparse2::operator()(std::mutex &mut)
{
  GSSparseTensor slice(t, cont.getStackSizes(), coor,
                       TensorDimens(cont.getStackSizes(), coor));
  if (slice.getNumNonZero())
    {
      FGSTensor fslice(slice);
      UGSTensor dense_slice(fslice);
      int r1 = slice.getFirstNonZeroRow();
      int r2 = slice.getLastNonZeroRow();
      UGSTensor dense_slice1(r1, r2-r1+1, dense_slice);
      UGSTensor out1(r1, r2-r1+1, out);

      cont.multAndAddStacks(coor, dense_slice1, out1, mut);
    }
}

WorkerUnfoldMAASparse2::WorkerUnfoldMAASparse2(const UnfoldedStackContainer &container,
                                               const FSSparseTensor &ten,
                                               UGSTensor &outten, IntSequence c)
  : cont(container), t(ten), out(outten), coor(std::move(c))
{
}

/* For a given folded sequence of stack coordinates ‘fi’ and an appropriate
   tensor ‘g’, whose symmetry is a symmetry of ‘fi’, the method contributes
   to ‘out’ all tensors of the unfolded stack formula involving the stacks
   chosen by ‘fi’.

   We go through all ‘ui’ coordinates which yield ‘fi’ after sorting. We
   construct a permutation ‘sort_per’ which sorts ‘ui’ to ‘fi’. We go through
   all appropriate equivalences, and construct a StackProduct from the
   equivalence classes permuted by ‘sort_per’, then a UPSTensor with the
   column permutation implied by the equivalence permuted by ‘sort_per’. The
   UPSTensor is then added to ‘out’.

   We cannot use the optimized KronProdStack here, since the symmetry of
   ‘UGSTensor &g’ prescribes the ordering of the stacks. However, if ‘g’ is
   fully symmetric, we can do the optimization harmlessly. */
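/* A fully symmetric ‘g’ arises, for example, when all entries of ‘fi’ are
   equal (a slice taken from a single stack); in that case the ordering of
   the stacks is immaterial for ‘g’, so optimizeOrder() can be applied
   safely. */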

void
UnfoldedStackContainer::multAndAddStacks(const IntSequence &fi,
                                         const UGSTensor &g,
                                         UGSTensor &out, std::mutex &mut) const
{
  const EquivalenceSet &eset = TLStatic::getEquiv(out.dimen());

  UFSTensor dummy_u(0, numStacks(), g.dimen());
  for (Tensor::index ui = dummy_u.begin(); ui != dummy_u.end(); ++ui)
    {
      IntSequence tmp(ui.getCoor());
      tmp.sort();
      if (tmp == fi)
        {
          Permutation sort_per(ui.getCoor());
          sort_per.inverse();
          for (const auto &it : eset)
            if (it.numClasses() == g.dimen())
              {
                StackProduct<UGSTensor> sp(*this, it, sort_per, out.getSym());
                if (!sp.isZero(fi))
                  {
                    KronProdStack<UGSTensor> kp(sp, fi);
                    if (g.getSym().isFull())
                      kp.optimizeOrder();
                    UPSTensor ups(out.getDims(), it, sort_per, g, kp);
                    {
                      std::unique_lock<std::mutex> lk{mut};
                      ups.addTo(out);
                    }
                  }
              }
        }
    }
}