/*
 * Copyright © 2005 Ondra Kamenik
 * Copyright © 2019 Dynare Team
 *
 * This file is part of Dynare.
 *
 * Dynare is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dynare is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dynare. If not, see <http://www.gnu.org/licenses/>.
 */

#include "faa_di_bruno.hh"
#include "fine_container.hh"

#include <cmath>
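/* A minimal usage sketch (hypothetical driver code, not part of this file; it
   assumes the containers and the output tensor have been set up elsewhere,
   and uses the FaaDiBruno and Journal constructors from the corresponding
   headers):

     Journal journal("journal.txt");
     FaaDiBruno fdb(journal);
     // ‘cont’ stacks the derivatives of the inner functions, ‘f’ holds the
     // sparse derivatives of the outer function, ‘out’ receives the composed
     // derivatives.
     fdb.calculate(cont, f, out);
*/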
// FaaDiBruno::calculate() folded sparse code
/* We take the opportunity to refine the stack container, so as to avoid
   allocating more memory than is available. */
void
FaaDiBruno::calculate(const StackContainer<FGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      auto [max, mem_mb, p_size_mb] = estimRefinement(out.getDims(), out.nrows(), l);
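      // Refine the stacks so that no stack has more rows than ‘max’; this
      // bounds the size of the slices (and hence the memory) used in the
      // multiplication below.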
      FoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
           << " stacks=" << cont.numStacks() << u8"→" << fine_cont.numStacks() << endrec;
    }
}

// FaaDiBruno::calculate() folded dense code
/* Here we simply evaluate multAndAdd() for the dense container; there is no
   opportunity for tuning. */
void
FaaDiBruno::calculate(const FoldedStackContainer &cont, const FGSContainer &g,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
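      // Record the available memory before the multiplication, so that the
      // journal entry below reflects the state at the start of this step.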
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}

// FaaDiBruno::calculate() unfolded sparse code
/* This is the same as the FaaDiBruno::calculate() folded sparse code; the
   only difference is that we construct an unfolded fine container. */
void
FaaDiBruno::calculate(const StackContainer<UGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      auto [max, mem_mb, p_size_mb] = estimRefinement(out.getDims(), out.nrows(), l);
      UnfoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
           << " stacks=" << cont.numStacks() << u8"→" << fine_cont.numStacks() << endrec;
    }
}

// FaaDiBruno::calculate() unfolded dense code
/* Again, no tuning opportunity here. */
void
FaaDiBruno::calculate(const UnfoldedStackContainer &cont, const UGSContainer &g,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}

/* This function returns the maximum number of rows to be used for the
   refinement of the stacked container, together with the available memory
   and the estimated temporary memory across all threads (both in MB, used
   for journal reporting). We want to set the maximum so that the expected
   memory consumption of the given number of parallel threads stays below the
   available memory. On the other hand, we do not want to be too pessimistic,
   since a very fine refinement can be very slow.

   Besides the memory needed for a dense unfolded slice of a tensor from ‘f’,
   each thread needs ‘magic_mult*per_size’ bytes of memory. In the worst
   case, ‘magic_mult’ would be equal to 2: memory ‘per_size’ for the target
   temporary (permuted symmetry) tensor plus one copy for the intermediate
   result. However, this turns out to be too pessimistic, so we set
   ‘magic_mult’ to 1.5. The memory ‘per_size’ for the permuted symmetry
   temporary tensor is estimated as a weighted average of the unfolded memory
   of the ‘out’ tensor and the unfolded memory of a symmetric tensor with the
   largest coordinate size. Some experiments showed that the best combination
   of the two is to take 100% of the latter, so we set ‘lambda’ to zero.

   The ‘max’ number of rows in the refined ‘cont’ must be such that each
   slice fits into the remaining memory. The number of columns of a slice is
   never greater than maxˡ. (This is not strictly true, since stacks
   corresponding to unit/zero matrices cannot be further refined.) We get the
   equation:

    nthreads·maxˡ·8·r = mem − magic_mult·nthreads·per_size

   where ‘mem’ is the available memory in bytes, ‘nthreads’ is the number of
   threads, r is the number of rows, and 8 is ‘sizeof(double)’. Solving for
   ‘max’ gives:

    max = ⌊((mem − magic_mult·nthreads·per_size)/(8·r·nthreads))^(1/l)⌋

   If the right-hand side of the first equation is negative (so that ‘max’
   comes out as zero), we set ‘max’ to 10, just to let it do something. */

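/* An illustrative instance of the formula above (all numbers made up): with
   mem = 8 GiB, nthreads = 4, per_size = 100 MiB, r = 1000 and l = 3, the
   right-hand side is 8·2³⁰ − 1.5·4·100·2²⁰ ≈ 7.96·10⁹ bytes, hence
   maxˡ ≈ 7.96·10⁹/(4·8·1000) ≈ 2.49·10⁵, and max = ⌊(2.49·10⁵)^(1/3)⌋ = 62. */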
std::tuple<int, int, int>
FaaDiBruno::estimRefinement(const TensorDimens &tdims, int nr, int l)
{
  int nthreads = sthread::detach_thread_group::max_parallel_threads;
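  // per_size1: number of columns of the fully unfolded ‘out’ tensor;
  // per_size2: number of columns of an unfolded l-dimensional tensor whose
  // coordinate size is the largest among the variable categories.
  // ‘lambda’ weights the two estimates; it is zero, as explained above.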
  long per_size1 = tdims.calcUnfoldMaxOffset();
  long per_size2 = static_cast<long>(std::pow(tdims.getNVS().getMax(), l));
  double lambda = 0.0;
  long per_size = sizeof(double)*nr
      *static_cast<long>(lambda*per_size1+(1-lambda)*per_size2);
  long mem = SystemResources::availableMemory();
  int max = 0;
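  // num_cols solves the equation above for maxˡ; when the memory estimate is
  // exhausted it comes out non-positive, ‘max’ stays at zero, and the
  // fallback below applies.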
  double num_cols = static_cast<double>(mem-magic_mult*nthreads*per_size)
      /nthreads/sizeof(double)/nr;
  if (num_cols > 0)
    {
      double maxd = std::pow(num_cols, 1.0/l);
      max = static_cast<int>(std::floor(maxd));
    }
  if (max == 0)
    {
      max = 10;
      JournalRecord rec(journal);
      rec << "dim=" << l << " ran out of memory, imposing max=" << max;
      if (nthreads > 1)
        rec << " (decrease number of threads)";
      rec << endrec;
    }
  int avmem_mb = mem/1024/1024;
  int tmpmem_mb = nthreads*per_size/1024/1024;
  return { max, avmem_mb, tmpmem_mb };
}