/*
 * Copyright © 2005 Ondra Kamenik
 * Copyright © 2019 Dynare Team
 *
 * This file is part of Dynare.
 *
 * Dynare is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dynare is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dynare.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "faa_di_bruno.hh"
#include "fine_container.hh"

#include <cmath>

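/* A schematic reminder of what the calculate() methods below evaluate (the
   notation is only a sketch and is not defined in this file): for a composed
   function B(s) = f(z(s)), the Faà di Bruno formula gives, roughly,

      [B_sᵈ] = ∑_{l=1..d} [f_zˡ] ∑_{c∈ℰ(l,d)} ∏_{m=1..l} [z_{c_m}],

   where ℰ(l,d) denotes the equivalences of d indices organized into l
   classes. Each pass of the loop over l below adds, via multAndAdd(), the
   term for one l, with ‘cont’ playing the role of the stacked z and ‘f’
   (or ‘g’) that of the derivatives of the outer function. */
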
// FaaDiBruno::calculate() folded sparse code
/* We take the opportunity to refine the stack container, so that we do not
   allocate more memory than is available. */
void
FaaDiBruno::calculate(const StackContainer<FGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      auto [max, mem_mb, p_size_mb] = estimRefinement(out.getDims(), out.nrows(), l);
      FoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
           << " stacks=" << cont.numStacks() << u8"→" << fine_cont.numStacks() << endrec;
    }
}
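
/* A hypothetical call site (assuming a constructor taking a Journal, which
   this file's use of ‘journal’ suggests, and suitably built objects ‘zcont’,
   ‘f’ and ‘B’ for the stack container, the sparse derivatives and the target
   tensor):

      FaaDiBruno fdb(journal);
      fdb.calculate(zcont, f, B);   // fills B with the Faà di Bruno sum up to B.dimen()

   The same pattern applies to the three overloads below. */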

// FaaDiBruno::calculate() folded dense code
/* Here we simply evaluate multAndAdd() for the dense container. There is no
   opportunity for tuning. */
void
FaaDiBruno::calculate(const FoldedStackContainer &cont, const FGSContainer &g,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}

// FaaDiBruno::calculate() unfolded sparse code
/* This is the same as the folded sparse code above; the only difference is
   that we construct an unfolded fine container. */
void
FaaDiBruno::calculate(const StackContainer<UGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      auto [max, mem_mb, p_size_mb] = estimRefinement(out.getDims(), out.nrows(), l);
      UnfoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
           << " stacks=" << cont.numStacks() << u8"→" << fine_cont.numStacks() << endrec;
    }
}

// FaaDiBruno::calculate() unfolded dense code
/* Again, no tuning opportunity here. */
void
FaaDiBruno::calculate(const UnfoldedStackContainer &cont, const UGSContainer &g,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}

/* This function returns the maximum number of rows to be used for the
   refinement of the stacked container. We want to set this maximum so that
   the expected memory consumption for the given number of parallel threads
   stays below the available memory. On the other hand, we do not want to be
   too pessimistic, since a very fine refinement can be very slow.

   Besides the memory needed for a dense unfolded slice of a tensor from ‘f’,
   each thread needs ‘magic_mult*per_size’ bytes of memory. In the worst case,
   ‘magic_mult’ will be equal to 2; this means memory ‘per_size’ for the
   target temporary (permuted symmetry) tensor plus one copy for the
   intermediate result. However, this turns out to be too pessimistic, so we
   set ‘magic_mult’ to 1.5. The memory for the permuted symmetry temporary
   tensor ‘per_size’ is estimated as a weighted average of the unfolded memory
   of the ‘out’ tensor and the unfolded memory of a symmetric tensor with the
   largest coordinate size. Some experiments showed that the best combination
   of the two is to take 100% of the latter, so we set ‘lambda’ to zero.

   The ‘max’ number of rows in the refined ‘cont’ must be such that each slice
   fits into the remaining memory. The number of columns of a slice is never
   greater than maxˡ. (This is not strictly true, since stacks corresponding
   to unit/zero matrices cannot be further refined.) We get the equation:

    nthreads·maxˡ·8·r = mem − magic_mult·nthreads·per_size

   where ‘mem’ is the available memory in bytes, ‘nthreads’ is the number of
   threads, r is the number of rows, and 8 is ‘sizeof(double)’.

   If the right-hand side is less than zero, we set ‘max’ to 10, just to let
   it do something. */
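
/* A purely illustrative example with made-up numbers: with mem = 2 GiB,
   nthreads = 4, r = 100 rows and per_size = 40 MB, the temporaries need
   1.5·4·40 MB = 240 MB, leaving roughly 1.9 GiB; hence
   maxˡ ≈ 1.9·10⁹/(4·8·100) ≈ 6·10⁵ columns per slice, and for l = 3 this
   gives max = ⌊(6·10⁵)^(1/3)⌋ ≈ 84. */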

std::tuple<int, int, int>
FaaDiBruno::estimRefinement(const TensorDimens &tdims, int nr, int l)
{
  int nthreads = sthread::detach_thread_group::max_parallel_threads;
  // Candidate column counts for the permuted symmetry temporary: the number
  // of columns of an unfolded tensor with ‘out’’s dimensions, and nmaxˡ for
  // the largest coordinate size nmax.
  long per_size1 = tdims.calcUnfoldMaxOffset();
  long per_size2 = static_cast<long>(std::pow(tdims.getNVS().getMax(), l));
  // lambda=0: ‘per_size’ is based entirely on ‘per_size2’ (see the comment above)
  double lambda = 0.0;
  long per_size = sizeof(double)*nr
    *static_cast<long>(lambda*per_size1+(1-lambda)*per_size2);
  long mem = SystemResources::availableMemory();
  int max = 0;
  // Solve the equation above for maxˡ, then take the l-th root.
  double num_cols = static_cast<double>(mem-magic_mult*nthreads*per_size)
    /nthreads/sizeof(double)/nr;
  if (num_cols > 0)
    {
      double maxd = std::pow(num_cols, 1.0/l);
      max = static_cast<int>(std::floor(maxd));
    }
  if (max == 0)
    {
      max = 10;
      JournalRecord rec(journal);
      rec << "dim=" << l << " run out of memory, imposing max=" << max;
      if (nthreads > 1)
        rec << " (decrease number of threads)";
      rec << endrec;
    }
  int avmem_mb = mem/1024/1024;
  int tmpmem_mb = nthreads*per_size/1024/1024;
  return { max, avmem_mb, tmpmem_mb };
}