1 /* Manage the in-memory data for the matrix */
2 
3 /* It's in C++ because the STL is handy, but that's really all there is
4  * to it ; a conversion to C would not be extremely difficult */
5 
6 #include "cado.h" // IWYU pragma: keep
7 // IWYU pragma: no_include <memory>
8 #include <cinttypes>
9 #include <climits>
10 #include <cmath>
11 #include <cstdarg>         // for va_list, va_end, va_start
12 #include <cstddef>      // size_t, NULL
13 #include <cstdint>
14 #include <cstdio>
15 #include <cstdlib>
16 #include <cstring>
17 // C++ headers.
18 #include <algorithm>    // sort
19 #include <deque>
20 #include <limits>       // underlying_type_{min,max}
21 #include <list>
22 #include <sstream> // ostringstream // IWYU pragma: keep
23 #include <string>           // for basic_string
24 #include <type_traits>      // for __strip_reference_wrapper<>::__type
25 #include <utility>          // for pair, make_pair, swap
26 #include <vector>
27 
28 #include "fmt/core.h"       // for check_format_string
29 #include "fmt/format.h"     // for basic_buffer::append, basic_parse_context...
30 #include "fmt/printf.h"     // fmt::fprintf // IWYU pragma: keep
31 #include "matmul.h"         // for matmul_ptr, matmul_public_s, MATMUL_AUX_Z...
32 #include "macros.h"
33 #include "verbose.h"    // CADO_VERBOSE_PRINT_BWC_CACHE_BUILD
34 #include "timing.h"     // wct_seconds
35 #include "mpfq_layer.h"
36 
37 using namespace std;
38 
39 /* Make sure that the assembly function is only called if it matches
40  * the abase header correctly !! */
41 #if defined(HAVE_GAS_SYNTAX_ASSEMBLY_SOURCES) && defined(SELECT_MPFQ_LAYER_u64k1)
42 // disabling one particular assembly code is done by simply disabling the
43 // header file (and optionally removing the assembly file from the link
44 // list in the CMakeLists.txt file, but in reality it's not needed. This
45 // file being C++, the names are mangled if no header file marks them as
46 // having C linkage. So commenting out is enough).
47 #include "matmul-sub-small1.h" // IWYU pragma: keep
48 #include "matmul-sub-small2.h" // IWYU pragma: keep
49 #include "matmul-sub-large-fbd.h" // IWYU pragma: keep
50 #include "matmul-sub-large-fbi.h" // IWYU pragma: keep
51 #include "matmul-sub-vsc-dispatch.h" // IWYU pragma: keep
52 #include "matmul-sub-vsc-combine.h" // IWYU pragma: keep
53 // #define ENABLE_ASM
54 #endif
55 
56 #include "matmul-common.h"
57 
58 #include "matmul_facade.h"
59 #include "portability.h" // strdup // IWYU pragma: keep
60 #include "params.h"
61 
62 /* {{{ Documentation
63  *
64  * Parameters referred to by this documentation text may be tuned via
65  * #define's below.
66  *
67  * This implementation builds upon the ``sliced'' variant.
68  * The format used with the -sliced implementation is still used, but
69  * there is also another format used for very dense strips. We refer to
70  * matmul-sliced.cpp for the first format, which is called ``small1''
71  * here (and there also).
72  *
73  * The ``small2'' format is specialised for denser strips.  Within a
74  * horizontal strip of fewer than 4096 rows, when two non-zero columns are
75  * never more than 16 positions apart, then the information (dj,i) is
76  * stored in 16 bits instead of 32. This reduces the memory footprint
77  * quite considerably, since many coefficients are in dense strips.
78  * Speed-wise, small2 strips provide a nice speed-up sometimes (not
79  * always).
80  *
81  * When we reach sparser areas, we use another method. Dense (small2 and
82  * small1) slices having been discarded, the rest is split in ``large''
83  * slices of LSL_NBUCKETS_MAX*256 rows at most (in fact we arrange for the
84  * slices to have equal size, so it's a bit less).  For each such large
85  * slice, we use scratch space equal to the L2 cache size for performing
86  * matrix multiplication. While reading column vector data, we copy
87  * coefficients to (up to) LSL_NBUCKETS_MAX ``buckets'', each representing
88  * 256 possible row indices. The dispatching information, telling in
89  * which bucket a given coefficient has to be copied, is an 8-bit integer
90  * stored in a table called ``main''. Each entry in table ``main'' also
91  * has a second 8-bit integer indicating the offset to the next useful
92  * column entry (because we're considering very sparse blocks, it is not
93  * at all uncommon to see average values ~ 10 here, even when
94  * LSL_NBUCKETS_MAX==256, so that 65536 rows are considered
95  * simultaneously).
96  *
97  * The number of columns considered while filling the scratch space is
98  * limited by the scratch space size. Once it's filled, we ``apply the
99  * buckets'', so as to be able to reuse our scratch space buffer for
100  * another batch of columns. Thus a ``large slice'' is typically split
101  * into several ``vertical blocks''. Extremely sparse large slices will
102  * have only one vertical block.
103  *
104  * ``Applying the buckets'' means taking a list of column coefficients
105  * which have been directed to the bucket because it is known that they
106  * affect at least one of the 256 corresponding row indices on output. So
107  * in addition to the N coefficients stored in the bucket, we have a
108  * list of N 8-bit integers indicating which row index is to be modified.
109  *
110  * [the section below is superseded by the next one]
111  * That's it for the basic idea. Now on top of that, we use an extra
112  * indirection level for very sparse blocks. Several ``large blocks'' are
113  * considered together as part of a ``huge block''. Call this number
114  * ``nlarge''. Source coefficients are first dispatched in nlarge ``big
115  * buckets'' of appropriate size. The number of columns considered is
116  * guided by the fact that we want none of these big buckets to exceed
117  * the L2 cache size.  Afterwards, we do a second dispatching from each
118  * of these big buckets (in turn) to (up to) LSL_NBUCKETS_MAX small
119  * buckets in the same manner as above (except that we no longer need the
120  * information on the link to the next useful column).
121  *
122  * [documentation of the new format]
123  * After ``large'' blocks, we consider ``deferred'' blocks, also referred
124  * to as (vertical) staircase-shaped blocks. The idea is the following.
125  * The portion of the matrix under study is split into several vertical
126  * blocks of at most 2^16 entries. In the other direction, we make splits
127  * according to the average row density. A first batch of rows has some
128  * indicative density X, the next one X/2, and so on (the ratio 2 being
129  * in fact VSC_BLOCKS_FLUSHFREQ_RATIO) until the sparsest block at the
130  * bottom.  Such horizontal blocks need not have the same size.
131  *
132  * Vertical strips are processed in order. Buffers corresponding to the
133  * sub-matrices delimited by the horizontal
134  * splitting are filled. We thus populate scratch space with coefficients
135  * from the source vector. After each such processing of a vertical
136  * strip, we consider whether we have to perform flushing w.r.t. the
137  * horizontal strips. The buffers corresponding to the densest horizontal
138  * strip are flushed after each vstrip processing. Those which are twice
139  * sparser are flushed half as often. Thus the next vstrip occupies
140  * some more buffer space. This pattern goes until the sparsest block,
141  * which is probably flushed only once. The total amount of buffer space
142  * needed is proportional to the number of coefficients in an area having
143  * a staircase shape. Assume we have N vstrips of width W. Assume
144  * horizontal blocks have a number of rows R0, R1, R2, ... corresponding
145  * to densities X, X/2, X/4, ...  (the ratio 2 here is in fact given by
146  * VSC_BLOCKS_FLUSHFREQ_RATIO). The formula
147  * for the tbuf space is:
148  *   max weight of the N matrices of size R0*W in h. strip #0
149  * + max weight of the N/2 matrices of size R1*2W in h. strip #1
150  * + max weight of the N/4 matrices of size R2*4W in h. strip #2
151  * ...
152  * It should be noted that the numbers added up in this formula are
153  * expected to be of the same magnitude asymptotically. For real-life
154  * examples though, it's pretty common to observe somewhat degenerate
155  * cases.
156  *
157  * }}} */
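/* To make the ``large slice'' description above more concrete, here is a
 * rough sketch of the dispatch/apply pair it implies. This is only an
 * illustration with plain uint64_t coefficients over GF(2) (the example_*
 * names are made up); the code below works on abelt values through the
 * abase layer, handles padding coefficients, and has hand-written
 * assembly counterparts. */
#if 0
static void example_fill_buckets(const uint64_t * src,
        uint64_t * buckets[], size_t fill[],
        const uint8_t * main_tbl, size_t ncoeffs)
{
    const uint64_t * s = src;
    for(size_t k = 0 ; k < ncoeffs ; k++) {
        s += main_tbl[2*k];             /* dj: advance to the next useful column */
        uint8_t b = main_tbl[2*k+1];    /* which 256-row bucket gets this one */
        buckets[b][fill[b]++] = *s;     /* stash the source coefficient */
    }
}
/* Applying one bucket: rows[] gives, for each stashed coefficient, the
 * row offset (0..255) within the 256-row block handled by this bucket. */
static void example_apply_bucket(uint64_t * dst /* 256 rows */,
        const uint64_t * bucket, const uint8_t * rows, size_t n)
{
    for(size_t k = 0 ; k < n ; k++)
        dst[rows[k]] ^= bucket[k];      /* over GF(2), accumulation is xor */
}
#endif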
158 
159 
160 /* {{{ Documentation for parameters */
161 // take only a portion of the L1 cache.
162 // #define L1_CACHE_SIZE   192000
163 // for 8-byte abt values, this gives 3500 items.
164 // note that allowing more than 4096 items here (or anything
165 // SMALL_SLICES_I_BITS dictates) effectively disables small2 slices,
166 // which pack (i,dj) values in 16 bits.
167 
168 // tunables for small2 format.
169 #define SMALL_SLICES_I_BITS     12
170 #define SMALL_SLICES_DJ_BITS    (16 - SMALL_SLICES_I_BITS)
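/* For illustration only: a small2 entry packs (dj,i) in a single
 * uint16_t, with SMALL_SLICES_I_BITS bits for the row index within the
 * strip and SMALL_SLICES_DJ_BITS bits for the column skip. A sketch of
 * such a packing (assuming dj goes in the high bits; the helpers below
 * are not used by the actual code) would be: */
#if 0
static inline uint16_t example_small2_pack(uint32_t dj, uint32_t i)
{
    ASSERT(dj < (1U << SMALL_SLICES_DJ_BITS));
    ASSERT(i < (1U << SMALL_SLICES_I_BITS));
    return (uint16_t) ((dj << SMALL_SLICES_I_BITS) | i);
}
static inline void example_small2_unpack(uint16_t x, uint32_t * dj, uint32_t * i)
{
    *dj = x >> SMALL_SLICES_I_BITS;
    *i  = x & ((1U << SMALL_SLICES_I_BITS) - 1);
}
#endif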
171 
172 // make sure that it's something each core can use (i.e. divide by two
173 // the L2 cache size for a dual-core w/ shared L2).
174 // #define L2_CACHE_SIZE   1600000
175 
176 /* To what should we compare the average dj value for determining when we
177  * should switch from dense (small) to sparse (large) blocks */
178 // #define DJ_CUTOFF1   8.0
179 
180 /* Same, but for stopping large blocks and go to the next type
181  * (currently, deferred blocks). Note that the time for large blocks grows
182  * faster with multi-cores, probably because of the strong dependency on
183  * TLB availability.
184  */
185 // #define DJ_CUTOFF2   24.0
186 
187 /* How many buckets in large slices, and in large slices which are
188  * contained in huge slices. */
189 // #define LSL_NBUCKETS_MAX      256
190 
191 /* How many large slices MINIMUM must go in a huge slice ? Below this
192  * value, we keep producing large slices. */
193 // note that this part of the code is currently inactive
194 #define HUGE_MPLEX_MIN        2 /* default so that the code compiles */
195 
196 /* Here, I've seen 97 slices be handled best with 2 mplexed rounds. So
197  * let's fix 64. */
198 // note that this part of the code is currently inactive
199 #define HUGE_MPLEX_MAX        64        /* default so that the code compiles */
200 
201 /* read batches of VSC_BLOCKS_ROW_BATCH rows, in order to have something
202  * which is less sensitive to variations incurred by splitting the matrix
203  * */
204 // #define VSC_BLOCKS_ROW_BATCH 512
205 
206 /* blocks smaller than this amount will be merged with previous / next
207  * block */
208 // #define VSC_BLOCKS_TOO_SMALL_CUTOFF     8192
209 
210 /* The number of flush frequencies (number of staircase steps) is
211  * inversely proportional to this. The larger this value, the taller the
212  * steps, and in return, the larger the temp buffers. The effect is
213  * generally mild but noticeable.
214  */
215 // #define VSC_BLOCKS_FLUSHFREQ_RATIO 3
216 /* }}} */
217 
218 /* {{{ Examples of parameter sets */
219 /* These defaults work well for 13.4M rows/cols 4x4 submatrix of
220  * snfs247.small on a Xeon L5640 (Westmere), for a single core. Note that
221  * we define L1_CACHE_SIZE to something insane, because in fact we're
222  * utilizing L2.
223  */
224 #if 0
225 #define L1_CACHE_SIZE   192000
226 #define L2_CACHE_SIZE   1600000
227 #define DJ_CUTOFF1   8.0
228 #define DJ_CUTOFF2   24.0
229 #define LSL_NBUCKETS_MAX      256
230 #define VSC_BLOCKS_ROW_BATCH 512
231 #define VSC_BLOCKS_TOO_SMALL_CUTOFF     8192
232 #define VSC_BLOCKS_FLUSHFREQ_RATIO 3
233 #endif
234 
235 /* This parameter set is successful for 4.6M rows x 2.8M cols, 12x20
236  * submatrix of the same snfs247.sparse, for 4 simultaneous cores of a
237  * Xeon X3440. In effect, we're disabling large slices here, and use
238  * taller steps for vsc.
239  */
240 #if 1
241 #define L1_CACHE_SIZE   262144
242 #define L2_CACHE_SIZE   800000
243 #define DJ_CUTOFF1   8.0
244 #define DJ_CUTOFF2   24.0 /* This is so small that large slices don't show up */
245 #define LSL_NBUCKETS_MAX      32
246 #define VSC_BLOCKS_ROW_BATCH 256
247 #define VSC_BLOCKS_TOO_SMALL_CUTOFF     8192
248 #define VSC_BLOCKS_FLUSHFREQ_RATIO 8.0
249 #endif
250 
251 /* These used to be the cado-nfs defaults for a long time. They are
252  * visibly inferior to the previous parameters on the L5640. */
253 #if 0
254 #define L1_CACHE_SIZE   28000
255 #define L2_CACHE_SIZE   1600000
256 #define DJ_CUTOFF1   4.5
257 #define DJ_CUTOFF2   8.0
258 #define LSL_NBUCKETS_MAX      256
259 #define VSC_BLOCKS_ROW_BATCH 256
260 #define VSC_BLOCKS_TOO_SMALL_CUTOFF     8192
261 #define VSC_BLOCKS_FLUSHFREQ_RATIO 1.7
262 #endif
263 /* }}} */
264 
265 #define xxxDEBUG_BUCKETS
266 
267 /* This extension is used to distinguish between several possible
268  * implementations of the matrix-times-vector product.
269  */
270 #define MM_EXTENSION   "-bucket"
271 
272 /* MM_MAGIC is stored in files, so as to check compatibility. The upper
273  * word (MM_MAGIC_FAMILY) corresponds to the implementation itself, the
274  * lower one (MM_MAGIC_VERSION) to the n-th binary incompatible change
275  * (make sure to bump it) */
276 #define MM_MAGIC_FAMILY        0xa003UL
277 #define MM_MAGIC_VERSION       0x1015UL
278 #define MM_MAGIC (MM_MAGIC_FAMILY << 16 | MM_MAGIC_VERSION)
279 
280 /* see matmul-basic.c */
281 #define MM_DIR0_PREFERS_TRANSP_MULT   1
282 
283 template<typename T> inline T * ptrbegin(vector<T>& v) { return v.data(); }
284 template<typename T> inline T const * ptrbegin(vector<T> const & v) { return v.data(); }
285 template<typename T> inline T * ptrend(vector<T>& v) { return v.size() + ptrbegin(v); }
286 template<typename T> inline T const * ptrend(vector<T> const & v) { return v.size() + ptrbegin(v); }
287 
288 #if 0
289 static unsigned int idiotic_sum(void * p, unsigned int nbytes)
290 {
291     uint8_t * q = (uint8_t *) p;
292     unsigned int res = 0x12345678;
293     for( ; nbytes-- ; q++) {
294         unsigned int z = *q;
295         z ^= z << 24;
296         res = (((res << 8) ^ (res * z)) - (z ^ 0xabababab)) ^ res >> 13;
297     }
298     return res;
299 }
300 #endif
301 
302 #define SLICE_TYPE_NONE           0
303 #define SLICE_TYPE_SMALL1 1
304 #define SLICE_TYPE_SMALL1_VBLOCK         2
305 #define SLICE_TYPE_SMALL2         3
306 
307 #define SLICE_TYPE_LARGE_ENVELOPE 4
308 
309 /* When coefficients go through several passes, they are considered as
310  * one slice for each pass. Note that the ordering of some slices is
311  * mandated by the design, e.g. slices of type LARGE_ASB must follow
312  * immediately their parent LARGE_FBI
313  */
314 
315 #if 0
316 #define SLICE_TYPE_LARGE_FBI      5
317 #define SLICE_TYPE_LARGE_ASB      6
318 #endif
319 
320 #define SLICE_TYPE_HUGE_ENVELOPE  7
321 
322 #if 0
323 #define SLICE_TYPE_HUGE_FBI       8
324 #define SLICE_TYPE_HUGE_FBD       9
325 #define SLICE_TYPE_HUGE_ASB       10
326 #endif
327 
328 #define SLICE_TYPE_DEFER_ENVELOPE       11
329 #define SLICE_TYPE_DEFER_COLUMN         12
330 #define SLICE_TYPE_DEFER_DIS            13
331 #define SLICE_TYPE_DEFER_ROW            14
332 #define SLICE_TYPE_DEFER_CMB        15
333 
334 #define SLICE_TYPE_MAX  16
335 
336 
337 static inline const char* slice_name(int s) {
338     switch(s) {
339         case SLICE_TYPE_SMALL1_VBLOCK: return "small1v";
340         case SLICE_TYPE_SMALL1: return "small1";
341         case SLICE_TYPE_SMALL2: return "small2";
342         case SLICE_TYPE_LARGE_ENVELOPE: return "large";
343         case SLICE_TYPE_HUGE_ENVELOPE: return "huge";
344         case SLICE_TYPE_DEFER_ENVELOPE: return "defer";
345         case SLICE_TYPE_DEFER_COLUMN: return "d-col";
346         case SLICE_TYPE_DEFER_DIS: return "d-dis";
347         case SLICE_TYPE_DEFER_ROW: return "d-row";
348         case SLICE_TYPE_DEFER_CMB: return "d-cmb";
349         default: return "other";
350     }
351 }
352 
353 struct slice_header_t {
354     uint16_t t;
355     uint16_t nchildren;  // w.r.t. the tree-like shape of the block structure.
356     uint8_t pad[4];      // make the structure 256-bit wide.
357     uint32_t i0, i1;
358     uint32_t j0, j1;
359     uint64_t ncoeffs;
360 };
361 
362 /* Ok, there's only one field for the moment. But there might be more
363  * eventually */
364 struct slice_runtime_stats {
365     double t;
366     slice_runtime_stats() : t(0) {}
367 };
368 
369 struct matmul_bucket_methods {
370     int small1;
371     int small2;
372     int large;
373     int huge;
374     int vsc;
375     inline matmul_bucket_methods(const char * desc = NULL) {
376         small1=small2=large=vsc=huge=0;
377         if (!desc) {
378             /* default configuration */
379             small1=small2=large=vsc=1;
380             return;
381         }
382         const char * p = desc;
383         for( ; ; ) {
384             const char * q = strchr(p, ',');
385             int n = q ? q-p : strlen(p);
386             if (strncmp(p, "small1", n) == 0) {
387                 ASSERT_ALWAYS(!small1);
388                 small1=1;
389             } else if (strncmp(p, "small2", n) == 0) {
390                 ASSERT_ALWAYS(!small2);
391                 small2=1;
392             } else if (strncmp(p, "large", n) == 0) {
393                 ASSERT_ALWAYS(!large);
394                 large=1;
395             } else if (strncmp(p, "huge", n) == 0) {
396                 ASSERT_ALWAYS(!huge && !vsc);
397                 huge=1;
398             } else if (strncmp(p, "vsc", n) == 0) {
399                 ASSERT_ALWAYS(!huge && !vsc);
400                 vsc=1;
401             } else {
402                 fprintf(stderr, "Parse error: %s\n", p);
403             }
404             if (!q) break;
405             p = q + 1;
406         }
407         ASSERT_ALWAYS(small1||!small2);
408         ASSERT_ALWAYS(small1||small2||large||vsc||huge);
409     }
410     inline bool operator<(matmul_bucket_methods const& o) {
411         if (vsc != o.vsc) return vsc < o.vsc;
412         if (huge != o.huge) return huge < o.huge;
413         if (large != o.large) return large < o.large;
414         if (small1 != o.small1) return small1 < o.small1;
415         if (small2 != o.small2) return small2 < o.small2;
416         return false;
417     }
418     inline bool something_beyond(const char * s) const {
419         matmul_bucket_methods o(s);
420         if (o.huge || o.vsc) return false;
421         if (o.large) return huge || vsc;
422         if (o.small1 || o.small2) return large || huge || vsc;
423         return true;
424     }
425 };
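/* Illustration of the expected syntax for the "matmul_bucket_methods"
 * parameter (not called anywhere; the values asserted here just follow
 * from the parsing and ordering rules above): */
#if 0
static void example_methods_usage()
{
    matmul_bucket_methods m("small1,small2,large");
    ASSERT_ALWAYS(m.small1 && m.small2 && m.large && !m.huge && !m.vsc);
    ASSERT_ALWAYS(m.something_beyond("small1,small2"));  /* large comes after small */
    ASSERT_ALWAYS(!m.something_beyond("large"));         /* nothing selected beyond large here */
}
#endif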
426 
427 struct matmul_bucket_data_s {
428     /* repeat the fields from the public_ interface */
429     struct matmul_public_s public_[1];
430     /* now our private fields */
431     abdst_field xab;
432     size_t npack;
433     size_t scratch1size;
434     size_t scratch2size;
435     size_t scratch3size;
436     abelt * scratch1;
437     abelt * scratch2;
438     abelt * scratch3;
439     vector<uint16_t> t16;       /* For small (dense) slices */
440     vector<uint8_t> t8;         /* For large (sparse) slices */
441     vector<unsigned int> aux;   /* Various descriptors -- fairly small */
442     /* headers are the first thing found in memory */
443     vector<slice_header_t> headers;
444     slice_runtime_stats main_timing;
445     vector<slice_runtime_stats> slice_timings;
446     matmul_bucket_methods methods;
447 };
448 
449 void MATMUL_NAME(clear)(matmul_ptr mm0)
450 {
451     struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
452     if (mm->scratch1) abvec_clear(mm->xab, &mm->scratch1, mm->scratch1size);
453     if (mm->scratch2) abvec_clear(mm->xab, &mm->scratch2, mm->scratch2size);
454     if (mm->scratch3) abvec_clear(mm->xab, &mm->scratch3, mm->scratch3size);
455 
456     matmul_common_clear(mm->public_);
457     // delete properly calls the destructor for members as well.
458     delete mm;
459 }
460 
461 static void mm_finish_init(struct matmul_bucket_data_s * mm);
462 
463 matmul_ptr MATMUL_NAME(init)(void * pxx, param_list pl, int optimized_direction)
464 {
465     abdst_field xx = (abdst_field) pxx;
466     struct matmul_bucket_data_s * mm;
467     mm = new matmul_bucket_data_s;
468 
469     /* We first make sure that everything gets to zero ; except that it
470      * may be more complicated than just calling memset, because we have
471      * some vector() types in there...
472      */
473     memset(mm->public_, 0, sizeof(struct matmul_public_s));
474     // memset(mm, 0, sizeof(struct matmul_bucket_data_s));
475     mm->scratch1size = 0; mm->scratch1 = NULL;
476     mm->scratch2size = 0; mm->scratch2 = NULL;
477     mm->scratch3size = 0; mm->scratch3 = NULL;
478 
479     mm->xab = xx;
480 
481     unsigned int npack = L1_CACHE_SIZE;
482     if (pl) param_list_parse_uint(pl, "l1_cache_size", &npack);
483     npack /= sizeof(abelt);
484     mm->npack = npack;
485 
486     unsigned int scratch1size = L2_CACHE_SIZE/2;
487     if (pl) param_list_parse_uint(pl, "l2_cache_size", &scratch1size);
488     scratch1size /= sizeof(abelt);
489     mm->scratch1size = scratch1size;
490 
491     unsigned int scratch2size;
492     scratch2size = scratch1size * HUGE_MPLEX_MAX; // (1 << 24)/sizeof(abelt));
493     mm->scratch2size = scratch2size;
494 
495     int suggest = optimized_direction ^ MM_DIR0_PREFERS_TRANSP_MULT;
496     mm->public_->store_transposed = suggest;
497     if (pl) {
498         param_list_parse_int(pl, "mm_store_transposed",
499                 &mm->public_->store_transposed);
500         if (mm->public_->store_transposed != suggest) {
501             fprintf(stderr, "Warning, mm_store_transposed"
502                     " overrides suggested matrix storage ordering\n");
503         }
504     }
505 
506     const char * tmp = NULL;
507 
508     if (pl)
509         tmp = param_list_lookup_string(pl, "matmul_bucket_methods");
510 
511     mm->methods = matmul_bucket_methods(tmp);
512 
513     return (matmul_ptr)(mm);
514 }
515 
516 /* This moves an element to the tail of a list with no copy, transferring
517  * the ownership to the container argument */
518 template<typename T>
519 void transfer(list<T> * ctr, T * elem)
520 {
521     ctr->push_back(T());
522     swap(ctr->back(), *elem);
523 }
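/* Typical usage sketch: fill a temporary object, then hand it over to
 * the list without copying its (possibly large) members. The vector<int>
 * used here is just for illustration. */
#if 0
static void example_transfer_usage()
{
    list<vector<int> > L;
    vector<int> v(1024, 42);
    transfer(&L, &v);           /* v's storage now lives in L.back() */
    ASSERT_ALWAYS(v.empty());   /* v was swapped with a default-constructed element */
}
#endif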
524 
525 template<typename T> T underlying_type_max(T const&) { return numeric_limits<T>::max(); }
526 template<typename T> T underlying_type_min(T const&) { return numeric_limits<T>::min(); }
527 template<typename T, typename U> T safe_assign(T & a, U const& b) {
528     ASSERT_ALWAYS(b <= numeric_limits<T>::max());
529     ASSERT_ALWAYS(b >= numeric_limits<T>::min());
530     return a = b;
531 }
532 
533 struct builder {
534     uint32_t * data[2];
535     // rowhead is a pointer which is naturally expected to *move* within
536     // the data[0] array.
537     uint32_t * rowhead;
538     uint32_t nrows_t;
539     uint32_t ncols_t;
540     const char * rowname;
541     const char * colname;
542     /* Statistics on the prime parameter affecting performance of
543      * subroutines. All are computed as (total sum, #samples) */
544     struct matmul_bucket_data_s * mm;
545 };
546 
547 static void builder_init(builder * mb, struct matmul_bucket_data_s * mm, uint32_t * data)
548 {
549     memset(mb, 0, sizeof(struct builder));
550     ASSERT_ALWAYS(data);
551     mb->data[0] = data;
552     mb->data[1] = NULL;
553 
554     mb->nrows_t = mm->public_->dim[ mm->public_->store_transposed];
555     mb->ncols_t = mm->public_->dim[!mm->public_->store_transposed];
556 
557     mb->rowname = rowcol[ mm->public_->store_transposed];
558     mb->colname = rowcol[!mm->public_->store_transposed];
559     mb->rowhead = mb->data[0];
560     mb->mm = mm;
561 }
562 
563 static void builder_clear(builder * mb)
564 {
565     free(mb->data[0]);
566     memset(mb, 0, sizeof(struct builder));
567 }
568 
569 /************************************/
570 /* Elementary (!) building routines */
571 
572 /* there's room for a slice type where data is stored row-major. It would
573  * make it possible to avoid most stores, for a nice performance gain. We
574  * might even consider accumulating four rows at a time to four different
575  * registers, so that the loads are compensated. The in-memory format
576  * must account for that.
577  */
578 
579 /* {{{ small slices */
580 
581 struct small_slice_t {
582     uint32_t i0;
583     uint32_t i1;
584     unsigned int ncoeffs;
585     double dj_avg;
586     uint32_t dj_max;
587     int is_small2;
588 
589     typedef std::vector<std::pair<uint16_t, uint16_t> > Lv_t;
590     typedef Lv_t::const_iterator Lvci_t;
591     typedef Lv_t::iterator Lvi_t;
592 
593     typedef std::deque<Lv_t> Lu_t;
594     typedef Lu_t::const_iterator Luci_t;
595     typedef Lu_t::iterator Lui_t;
596 
597     Lu_t Lu;
598 };
599 
600 static int builder_do_small_slice(builder * mb, struct small_slice_t * S, uint32_t i0, uint32_t i1)
601 {
602     S->i0 = i0;
603     S->i1 = i1;
604     S->ncoeffs = 0;
605     ASSERT_ALWAYS(i1-i0 <= (1 << 16) );
606 
607     /* Make enough vstrips */
608     S->Lu.assign(iceildiv(mb->ncols_t, 1UL << 16), small_slice_t::Lv_t());
609     uint32_t * ptr0 = mb->rowhead;
610     /* We're doing a new slice */
611     for(uint32_t i = i0 ; i < i1 ; i++) {
612         for(unsigned int j = 0 ; j < *mb->rowhead ; j++) {
613             uint32_t jj = mb->rowhead[1+j];
614             S->Lu[jj>>16].push_back(std::make_pair(jj%(1<<16),i-i0));
615             S->ncoeffs++;
616         }
617         mb->rowhead += 1 + *mb->rowhead;
618     }
619     /* L is the list of (column index, row index) of all
620      * coefficients in the current horizontal slice */
621 
622     /* Convert all j indices to differences */
623     S->dj_max = 0;
624     S->dj_avg = mb->ncols_t / (double) (S->ncoeffs + !S->ncoeffs);
625 
626     typedef small_slice_t::Lui_t Lui_t;
627     typedef small_slice_t::Lvci_t Lvci_t;
628     uint32_t lu_offset = 0;
629     uint32_t j = 0;
630     for(Lui_t lup = S->Lu.begin() ; lup != S->Lu.end() ; ++lup, lu_offset+=1<<16) {
631         std::sort(lup->begin(), lup->end());
632         for(Lvci_t lvp = lup->begin() ; lvp != lup->end() ; ++lvp) {
633             uint32_t dj = (lu_offset + lvp->first) - j;
634             j = lu_offset + lvp->first;
635             if (dj > S->dj_max) { S->dj_max = dj; }
636         }
637     }
638 
639     /* There are two possible reasons for this slice to be discarded.
640      * Either the max dj is too large -- larger than 16 bits -- or the
641      * average dj is too large -- larger than some guessed value.
642      */
643     /* Now that we're splitting in vblocks anyway, it's not important to
644      * constrain dj below 2^16 for small1 */
645     int keep1 = 1; // S->dj_max < (1UL << 16);
646     int keep2 = S->dj_max < (1 << SMALL_SLICES_DJ_BITS);
647 
648     if ((i1-i0) >> SMALL_SLICES_I_BITS) keep2=0;
649 
650     if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
651         if (!keep1) printf(" [cannot be small1, beyond impl limits]");
652         if (!keep2) printf(" [cannot be small2, beyond impl limits]");
653     }
654 
655     if (!mb->mm->methods.small2) keep2=0;
656 
657     if (mb->mm->methods.something_beyond("small1,small2")) {
658         /* Then we have more than just small slices. So we're enforcing
659          * our separation criterion based on DJ */
660         keep1 = keep1 && S->dj_avg < DJ_CUTOFF1;
661         keep2 = keep2 && S->dj_avg < DJ_CUTOFF1;
662     }
663 
664     S->is_small2 = keep2;
665 
666     if (!keep1 && !keep2) {
667         mb->rowhead = ptr0;
668     }
669     return keep1 || keep2;
670 }
671 
672 /* }}} */
673 
674 /* {{{ large slices */
675 
676 struct large_slice_vblock_t {
677     uint32_t j0;
678     uint32_t j1;
679     unsigned int n;             // number of coeffs.
680     unsigned int pad;           // (half the) number of padding coeffs.
681     vector<uint8_t> t8c;        // read: contribution to t8.
682     vector<unsigned int> auxc;  // read: contribution to aux.
683 };
684 
685 struct large_slice_t {
686     slice_header_t hdr[1];
687     double dj_avg;
688     uint32_t dj_max;
689     list<large_slice_vblock_t> vbl;
690 };
691 
692 struct large_slice_raw_t {
693     vector<uint8_t> ind[LSL_NBUCKETS_MAX];
694     vector<uint8_t> main;
695     vector<uint32_t> col_sizes;
696     vector<uint32_t> pad_sizes;
697 };
698 
699 static void split_large_slice_in_vblocks(builder * mb, large_slice_t * L, large_slice_raw_t * R, unsigned int scratch1size)
700 {
701     /* Now split into vslices */
702     uint8_t * mp = ptrbegin(R->main);
703     uint8_t * ip[LSL_NBUCKETS_MAX];
704     for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
705         ip[k] = ptrbegin(R->ind[k]);
706     }
707     unsigned int vblocknum = 0;
708     double vbl_ncols_variance = 0;
709     for(uint32_t j = 0 ; j < mb->ncols_t ; vblocknum++) {
710         uint32_t j0 = j;
711         size_t n = 0;
712         size_t np = 0;
713         for( ; j < mb->ncols_t ; j++) {
714             size_t dn = R->col_sizes[j];
715             size_t dnp = R->pad_sizes[j];
716             if (n + dn + 2 * (np + dnp) > scratch1size) {
717                 /* If dn doesn't fit, then we're in really, really big
718                  * trouble ! */
719                 ASSERT_ALWAYS(n != 0);
720                 break;
721             }
722             n += dn;
723             np += dnp;
724         }
725         uint32_t j1 = j;
726 
727         large_slice_vblock_t V;
728         V.j0 = j0;
729         V.j1 = j1;
730         /* First the main block, then all the bucket blocks. These come
731          * in order. */
732         /* We have nf coeffs, counting padding. This makes 2nf bytes in
733          * main, and nf bytes in the bucket blocks */
734         V.n = n;
735         V.pad = np;
736         size_t nf = n + 2 * np;
737         V.auxc.reserve(258);
738         V.auxc.push_back(V.j1 - V.j0);
739         /* XXX Notice that the reader process will consider as a j0 index
740          * the farthest possible index from the previous vblock, which
741          * means that we _must_ start with a zero in the main block here
742          * no matter what -- even if with regard to the way data is set
743          * up at the upper level (huge unique vblock), we would have had
744          * a positive integer. This digression does not apply to the
745          * first vblock.
746          *
747          * In the case where we're starting with a padding coefficient,
748          * then we might actually end up needing to cancel out _several_
749          * coefficients, since the padding coeffs need to be cancelled as
750          * well.
751          */
752         V.t8c.resize(3 * nf);
753         uint8_t * q = ptrbegin(V.t8c);
754         memcpy(q, mp, 2 * nf * sizeof(uint8_t));
755         if (vblocknum) {
756             uint8_t * pmp = q;
757             for(size_t k = 0 ; k + 2 <= nf ; k+=2, pmp+=4) {
758                 if (pmp[0] != 255 || pmp[1] || pmp[2] || pmp[3])
759                     break;
760                 pmp[0] = 0;
761             }
762             if (pmp-q) {
763                 /* It's ok to cancel padding coefficients, anyway it
764                  * should be very rare. However, it would be bad to
765                  * start cancelling a huge number of these, because then we
766                  * would be better off adjusting the pointers (peeling
767                  * off the phony computation at the beginning of mf[]).
768                  * Problem is that in such a case, we would also have to
769                  * adjust the indirection pointer as well, which would be
770                  * messy. */
771                 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
772                         "*** cancelled %td padding coefficient\n", (pmp-q)/4);
773             }
774             pmp[0] = 0;
775         }
776         unsigned int ind_sizes[LSL_NBUCKETS_MAX] = {0,};
777         q += 2*nf;
778         for(size_t k = 0 ; k < nf ; k++) {
779             ind_sizes[mp[2 * k + 1]]++;
780         }
781         mp += 2 * nf;
782         V.auxc.push_back(nf);
783         for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
784             ASSERT(ip[k]+ind_sizes[k] <= ptrend(R->ind[k]));
785             memcpy(q, ip[k], ind_sizes[k] * sizeof(uint8_t));
786             q += ind_sizes[k];
787             ip[k] += ind_sizes[k];
788             V.auxc.push_back(ind_sizes[k]);
789 
790             // mb->asb_avg[0] += ind_sizes[k];
791             // mb->asb_avg[1] += MIN(L->hdr->i1 - L->hdr->i0 - k * 256, 256);
792         }
793         ASSERT(q == ptrend(V.t8c));
794 
795         vbl_ncols_variance += ((double)(j1-j0)) * ((double)(j1-j0));
796 #if 0
797         if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
798             printf(" vbl%u", vblocknum);
799             printf(": %ss %u..%u ", mb->colname, j0, j1);
800             printf("; w %u", V.n);
801             printf("; avg dj %.1f", (j1 - j0) / (double) V.n);
802             if (V.pad) printf("; pad 6*%u", V.pad);
803             printf("\n");
804         }
805 #endif
806         transfer(&(L->vbl), &V);
807     }
808     double vbl_ncols_mean = mb->ncols_t;
809     vbl_ncols_mean /= vblocknum;
810     vbl_ncols_variance /= vblocknum;
811     double vbl_ncols_sdev = sqrt(vbl_ncols_variance - vbl_ncols_mean * vbl_ncols_mean);
812     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
813             " %u vblocks, sdev/avg = %.2f\n", vblocknum, vbl_ncols_sdev / vbl_ncols_mean);
814 
815 }
816 
817 /* Do the partial transpose. Since we're handling sparse areas, it's
818  * quite a bit faster than doing the full-blown transpose (even though
819  * transposing may be needed anyway because the input matrix is in the
820  * wrong order, it saves us hassles to only view the matrix in one given
821  * way).
822  *
823  * Note though that there _is_ some loss ; when testing, we no longer
824  * have access to the pre-computed transposed file, since we recompute
825  * the transposed parts each time.
826  */
827 
828 static uint32_t * do_partial_transpose(builder * mb, vector<uint32_t> & cs, uint32_t i0, uint32_t i1) /*{{{*/
829 {
830     uint32_t * cols;
831     uint32_t * qptr;
832 
833     uint32_t * ptr = mb->rowhead;
834     for(uint32_t i = i0 ; i < i1 ; i++) {
835         uint32_t w = *ptr++;
836         for( ; w-- ; ) {
837             uint32_t j = *ptr++;
838             cs[j]++;
839         }
840     }
841     cols = new uint32_t[ptr - mb->rowhead - (i1 - i0)];
842     qptr = cols;
843 
844     vector<uint32_t *> colptrs;
845     for(uint32_t j = 0 ; j < mb->ncols_t ; j++) {
846         colptrs.push_back(qptr);
847         qptr += cs[j];
848     }
849     ASSERT(qptr-cols == ptr - mb->rowhead - (ptrdiff_t) (i1 - i0));
850     ptr = mb->rowhead;
851     for(uint32_t i = i0 ; i < i1 ; i++) {
852         uint32_t w = *ptr++;
853         for( ; w-- ; ) {
854             uint32_t j = *ptr++;
855             *colptrs[j]++ = i - i0;
856         }
857     }
858     colptrs.clear();
859     mb->rowhead = ptr;
860     return cols;
861 }/*}}}*/
862 
863 static int builder_do_large_slice(builder * mb, struct large_slice_t * L, uint32_t i0, uint32_t i1, uint32_t imax, unsigned int scratch1size)
864 {
865     memset(L->hdr, 0, sizeof(slice_header_t));
866     L->hdr->t = SLICE_TYPE_LARGE_ENVELOPE;
867     L->hdr->j0 = 0;
868     L->hdr->j1 = mb->ncols_t;
869     L->hdr->i0 = i0;
870     L->hdr->i1 = i1;
871     L->hdr->ncoeffs = 0;
872 
873     L->dj_max = 0;
874 
875     large_slice_raw_t R[1];
876 
877     R->col_sizes.assign(mb->ncols_t, 0);
878     R->pad_sizes.assign(mb->ncols_t, 0);
879 
880     uint32_t * ptr0 = mb->rowhead;
881     uint32_t * cols = do_partial_transpose(mb, R->col_sizes, i0, i1);
882     uint32_t * qptr = cols;
883 
884     uint32_t last_j = 0;
885 
886     /* First we create a huge unique vblock, and later on decide on
887      * how to split it. */
888     for(uint32_t j = 0 ; j < mb->ncols_t ; j++) {
889         uint32_t len  = R->col_sizes[j];
890 
891         if (!len) continue;
892 
893         uint32_t diff = j - last_j;
894         if (diff > L->dj_max) {
895             L->dj_max = diff;
896         }
897         for( ; diff > 255 ; ) {
898             R->main.push_back(255);
899             R->main.push_back(0);
900             R->ind[0].push_back(0);
901             R->main.push_back(0);
902             R->main.push_back(0);
903             R->ind[0].push_back(0);
904             diff -= 255;
905             R->pad_sizes[j] += 1;
906         }
907 
908         /* Only the first entry in the column has a non-zero dj. So place
909          * 0 instead inside the loop, and fix it in the end. */
910         R->main.push_back(diff);
911         last_j = j;
912         for( ; len-- ; ) {
913             uint32_t i = *qptr++;
914             uint8_t w = i / 256;
915             R->main.push_back(w);
916             R->main.push_back(0);
917             R->ind[w].push_back((uint8_t) i);
918             L->hdr->ncoeffs ++;
919         }
920         R->main.pop_back();
921     }
922     ASSERT(qptr-cols == mb->rowhead - ptr0 - (ptrdiff_t) (i1 - i0));
923     qptr = NULL;
924     delete[] cols;
925     cols = NULL;
926 
927     /* The average dj is quite easy */
928     L->dj_avg = mb->ncols_t / (double) L->hdr->ncoeffs;
929 
930     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
931             " w=%" PRIu64 ", avg dj=%.1f, max dj=%u, bucket hit=1/%.1f",
932             L->hdr->ncoeffs, L->dj_avg, L->dj_max, LSL_NBUCKETS_MAX * L->dj_avg);
933 
934     if (mb->mm->methods.something_beyond("large")) {
935         /* Then we may enforce our separation criterion */
936         if (L->dj_avg > DJ_CUTOFF2) {
937             verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
938                     "-> too sparse");
939             if (imax - i0 < HUGE_MPLEX_MIN * LSL_NBUCKETS_MAX * 256) {
940                 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
941                         "; kept because of short tail\n");
942             } else {
943                 /* We won't keep this slice */
944                 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD, "\n");
945                 mb->rowhead = ptr0;
946                 return 0;
947             }
948         }
949     }
950 
951     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
952             "\n");
953 
954     split_large_slice_in_vblocks(mb, L, R, scratch1size);
955 
956     safe_assign(L->hdr->nchildren, L->vbl.size());
957 
958     return 1;
959 }
960 
961 /* }}} */
962 
963 /* {{{ huge slices */
964 
965 struct huge_slice_t {
966     slice_header_t hdr[1];
967     double dj_avg;
968     uint32_t dj_max;
969     unsigned int nlarge;
970     list<large_slice_vblock_t> vbl;
971 };
972 
973 struct huge_subslice_raw_t {
974     vector<uint8_t> ind[LSL_NBUCKETS_MAX];
975     vector<uint8_t> main;
976 };
977 
978 struct huge_subslice_ptrblock_t {
979     uint8_t * ip[LSL_NBUCKETS_MAX];
980     uint8_t * mp;
981 };
982 
983 struct huge_slice_raw_t {
984     vector<uint8_t> super;
985     vector<huge_subslice_raw_t> subs;
986     vector<uint32_t> col_sizes;
987     vector<uint32_t> pad_sizes;
988 };
989 
990 static void split_huge_slice_in_vblocks(builder * mb, huge_slice_t * H, huge_slice_raw_t * R, unsigned int scratch2size)/*{{{*/
991 {
992     /* Now split into vslices */
993     // unsigned int lsize = iceildiv(H->hdr->i1 - H->hdr->i0, H->nlarge);
994     uint8_t * sp = ptrbegin(R->super);
995     vector<huge_subslice_ptrblock_t> ptrs(R->subs.size());
996     for(unsigned int i = 0 ; i < R->subs.size() ; i++) {
997         ptrs[i].mp = ptrbegin(R->subs[i].main);
998         for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
999             ptrs[i].ip[k] = ptrbegin(R->subs[i].ind[k]);
1000         }
1001     }
1002     double vbl_ncols_variance = 0;
1003     unsigned int vblocknum = 0;
1004     for(uint32_t j = 0 ; j < mb->ncols_t ; vblocknum++) {
1005         uint32_t j0 = j;
1006 #if 0
1007         /* First way to advance to the next j -- compare the total number
1008          * of coeffs to the scratch2size argument */
1009         size_t n = 0;
1010         size_t np = 0;
1011         for( ; j < mb->ncols_t ; j++) {
1012             size_t dn = R->col_sizes[j];
1013             size_t dnp = R->pad_sizes[j];
1014             if (n + dn + 2 * (np + dnp) > scratch2size) {
1015                 /* If dn doesn't fit, then we're in really, really big
1016                  * trouble ! */
1017                 ASSERT_ALWAYS(n != 0);
1018                 break;
1019             }
1020             n += dn;
1021             np += dnp;
1022         }
1023         uint32_t j1 = j;
1024 #else
1025         /* However, there's another way. We might as well arrange so that
1026          * no mega bucket exceeds the L2 cache. */
1027         uint32_t j1 = j;
1028         uint8_t * sp0 = sp;
1029         uint8_t * spc = sp0;
1030         /* XXX Notice that the reader process will consider as a j0 index
1031          * the farthest possible index from the previous vblock, which
1032          * means that we _must_ start with a zero in the main block here
1033          * no matter what -- even if with regard to the way data is set
1034          * up at the upper level (huge unique vblock), we would have had
1035          * a positive integer. This digression does not apply to the
1036          * first vblock. */
1037         if (vblocknum) {
1038             sp0[0]=0;
1039         }
1040         {
1041             vector<unsigned int> Lsizes(H->nlarge, 0);
1042             for( ; j < mb->ncols_t && sp != ptrend(R->super) ; ) {
1043                 unsigned int dj = *sp;
1044                 j += dj;
1045                 if (dj) {
1046                     /* beginning of a column. */
1047                     spc = sp;
1048                     j1 = j;
1049                 }
1050                 sp++;
1051                 unsigned int k = *sp++;
1052                 Lsizes[k]++;
1053                 if (Lsizes[k] > scratch2size / HUGE_MPLEX_MAX) {
1054                     Lsizes[k]--;
1055                     sp = spc;
1056                     break;
1057                 }
1058             }
1059             if (sp == ptrend(R->super)) {
1060                 spc = sp;
1061                 j1 = mb->ncols_t;
1062             }
1063         }
1064         /* TODO: abandon the n/np distinction. It's useful for debugging
1065          * at best, and it would not hurt to have it displayed only at the
1066          * upper level.
1067          */
1068         size_t n = 0;
1069         size_t np = 0;
1070         for(j = j0 ; j < j1 ; j++) {
1071             n += R->col_sizes[j];
1072             np += R->pad_sizes[j];
1073         }
1074         ASSERT_ALWAYS((n+2*np)*2 == (size_t) (spc - sp0));
1075         sp = sp0;
1076 #endif
1077 
1078         /* That's the same type, although it contains more stuff. */
1079         large_slice_vblock_t V;
1080         V.j0 = j0;
1081         V.j1 = j1;
1082         V.auxc.reserve(1 + H->nlarge * (1 + LSL_NBUCKETS_MAX));
1083         V.auxc.push_back(V.j1 - V.j0);
1084         /* First the super block, and then for each contained large
1085          * slice, the main block, then all the bucket blocks. These come
1086          * in order. */
1087         /* We have nf coeffs, counting padding. This makes 2nf bytes in
1088          * super, nf in mains, and nf in the bucket blocks */
1089         V.n = n;
1090         V.pad = np;
1091         size_t nf = n + 2 * np;
1092         V.t8c.resize(4 * nf);
1093         uint8_t * q = ptrbegin(V.t8c);
1094         memcpy(q, sp, 2 * nf * sizeof(uint8_t));
1095         /* TODO: If the solution above is to be kept, then of course we'd
1096          * rather reuse this thing above, since it has already been
1097          * computed.
1098          */
1099         vector<unsigned int> L_sizes(H->nlarge, 0);
1100         q += 2*nf;
1101         for(size_t k = 0 ; k < nf ; k++) {
1102             ASSERT(sp[2 * k + 1] < H->nlarge);
1103             L_sizes[sp[2 * k + 1]]++;
1104         }
1105         sp += 2 * nf;
1106         V.auxc.push_back(nf);
1107         for(unsigned int l = 0 ; l < H->nlarge ; l++) {
1108             V.auxc.push_back(L_sizes[l]);
1109         }
1110         for(unsigned int l = 0 ; l < H->nlarge ; l++) {
1111             unsigned int ind_sizes[LSL_NBUCKETS_MAX] = {0,};
1112             memcpy(q, ptrs[l].mp, L_sizes[l] * sizeof(uint8_t));
1113             q += L_sizes[l];
1114             for(unsigned int k = 0 ; k < L_sizes[l] ; k++) {
1115                 ind_sizes[ptrs[l].mp[k]]++;
1116             }
1117             ptrs[l].mp += L_sizes[l];
1118 
1119             // unsigned int i_size = MIN(H->hdr->i1 - H->hdr->i0 - l * lsize, lsize);
1120 
1121             for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
1122                 ASSERT(ptrs[l].ip[k]-(ptrbegin(R->subs[l].ind[k]))+(ptrdiff_t) ind_sizes[k] <= (ptrdiff_t) R->subs[l].ind[k].size());
1123                 memcpy(q, ptrs[l].ip[k], ind_sizes[k] * sizeof(uint8_t));
1124                 q += ind_sizes[k];
1125                 ptrs[l].ip[k] += ind_sizes[k];
1126                 V.auxc.push_back(ind_sizes[k]);
1127                 // mb->asb_avg[0] += ind_sizes[k];
1128                 // mb->asb_avg[1] += MIN(i_size - k * 256, 256);
1129             }
1130         }
1131         ASSERT(q == ptrend(V.t8c));
1132 
1133         vbl_ncols_variance += ((double)(j1-j0)) * ((double)(j1-j0));
1134 
1135         /* TODO: have some sort of verbosity level */
1136 #if 0
1137         if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1138             printf(" vbl%u", vblocknum);
1139             printf(": %ss %u..%u ", mb->colname, j0, j1);
1140             printf("; w %u", V.n);
1141             printf("; avg dj %.1f", (j1 - j0) / (double) V.n);
1142             if (V.pad) printf("; pad 6*%u", V.pad);
1143             printf("\n");
1144         }
1145 #endif
1146 
1147         transfer(&(H->vbl), &V);
1148     }
1149     if (!vblocknum) {
1150         verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1151                 " 0 vblocks\n");
1152         return;
1153     }
1154     double vbl_ncols_mean = mb->ncols_t;
1155     vbl_ncols_mean /= vblocknum;
1156     vbl_ncols_variance /= vblocknum;
1157     double vbl_ncols_sdev = sqrt(vbl_ncols_variance - vbl_ncols_mean * vbl_ncols_mean);
1158     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1159             " %u vblocks, sdev/avg = %.2f\n", vblocknum, vbl_ncols_sdev / vbl_ncols_mean);
1160 }/*}}}*/
1161 
1162 static int builder_do_huge_slice(builder * mb, struct huge_slice_t * H, uint32_t i0, uint32_t i1, unsigned int scratch2size)
1163 {
1164     memset(H->hdr, 0, sizeof(slice_header_t));
1165     H->hdr->t = SLICE_TYPE_HUGE_ENVELOPE;
1166     H->hdr->j0 = 0;
1167     H->hdr->j1 = mb->ncols_t;
1168     H->hdr->i0 = i0;
1169     H->hdr->i1 = i1;
1170     H->hdr->ncoeffs = 0;
1171 
1172     H->dj_max = 0;
1173 
1174     huge_slice_raw_t R[1];
1175 
1176     R->col_sizes.assign(mb->ncols_t, 0);
1177     R->pad_sizes.assign(mb->ncols_t, 0);
1178 
1179     uint32_t * ptr0 = mb->rowhead;
1180     uint32_t * cols = do_partial_transpose(mb, R->col_sizes, i0, i1);
1181     uint32_t * qptr = cols;
1182 
1183     /* How many large slices in this huge slice ? */
1184 
1185     H->nlarge = iceildiv(i1 - i0, LSL_NBUCKETS_MAX * 256);
1186     ASSERT(H->nlarge >= HUGE_MPLEX_MIN);
1187     ASSERT(H->nlarge <= HUGE_MPLEX_MAX);
1188     unsigned int lsize = iceildiv(i1 - i0, H->nlarge);
1189     ASSERT(lsize <= LSL_NBUCKETS_MAX * 256);
1190     ASSERT(lsize * H->nlarge >= i1 - i0);
1191     R->subs.assign(H->nlarge, huge_subslice_raw_t());
1192 
1193     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1194             " (%u*%u) ", H->nlarge, lsize);
1195     uint32_t last_j = 0;
1196 
1197     /* First we create a huge unique vblock, and later on decide on
1198      * how to split it. */
1199     uint32_t next_dot = mb->ncols_t / H->nlarge;
1200     for(uint32_t j = 0 ; j < mb->ncols_t ; j++) {
1201         uint32_t len  = R->col_sizes[j];
1202 
1203         if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1204             if (j > next_dot) {
1205                 printf(".");
1206                 fflush(stdout);
1207                 next_dot += mb->ncols_t / H->nlarge;
1208             }
1209         }
1210 
1211         if (!len)
1212             continue;
1213 
1214         uint32_t diff = j - last_j;
1215         if (diff > H->dj_max) {
1216             H->dj_max = diff;
1217         }
1218         for( ; diff > 255 ; ) {
1219             R->super.push_back(255);    // dj
1220             R->super.push_back(0);      // k
1221             R->subs[0].main.push_back(0);
1222             R->subs[0].ind[0].push_back(0);
1223             R->super.push_back(0);      // dj
1224             R->super.push_back(0);      // k
1225             R->subs[0].main.push_back(0);
1226             R->subs[0].ind[0].push_back(0);
1227             diff -= 255;
1228             R->pad_sizes[j] += 1;
1229         }
1230 
1231         /* Only the first entry in the column has a non-zero dj. So place
1232          * 0 instead inside the loop, and fix it in the end. */
1233         R->super.push_back(diff);
1234         last_j = j;
1235         for( ; len-- ; ) {
1236             uint32_t i = *qptr++;
1237             uint8_t w = i / lsize;
1238             uint8_t wq = (i % lsize) / 256;
1239             uint8_t wr = i % lsize;
1240             R->super.push_back(w);
1241             R->super.push_back(0);
1242             R->subs[w].main.push_back(wq);
1243             R->subs[w].ind[wq].push_back(wr);
1244             H->hdr->ncoeffs ++;
1245         }
1246         R->super.pop_back();
1247     }
1248     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD, "\n");
1249     ASSERT(qptr-cols == mb->rowhead - ptr0 - (ptrdiff_t) (i1 - i0));
1250     qptr = NULL;
1251     delete[] cols;
1252     cols = NULL;
1253 
1254     /* The average dj is quite easy */
1255     H->dj_avg = mb->ncols_t / (double) H->hdr->ncoeffs;
1256 
1257     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1258             " w=%" PRIu64 ", avg dj=%.1f, max dj=%u, bucket block hit=1/%.1f\n",
1259             H->hdr->ncoeffs, H->dj_avg, H->dj_max,
1260             H->nlarge * H->dj_avg);
1261 
1262     if (0) {
1263         /* Don't keep -- well, here this never occurs */
1264         mb->rowhead = ptr0;
1265         return 0;
1266     }
1267 
1268     split_huge_slice_in_vblocks(mb, H, R, scratch2size);
1269 
1270     safe_assign(H->hdr->nchildren, H->vbl.size());
1271 
1272     return 1;
1273 }
1274 
1275 /* }}} */
1276 
1277 /* {{{ vertical staircase slices */
1278 
1279 struct vsc_step_t {
1280     unsigned int defer;
1281     uint32_t nrows;
1282     unsigned int density_upper_bound;
1283     unsigned long tbuf_space;
1284 };
1285 
1286 static inline unsigned int when_flush(unsigned int k, unsigned int nv, unsigned int defer)
1287 {
1288     k -= k % defer;
1289     k += defer - 1;
1290     k = min(k, nv-1);
1291     return k;
1292 }
1293 
1294 static inline int flush_here(unsigned int k, unsigned int nv, unsigned int defer) {
1295     return k == when_flush(k,nv,defer);
1296 }
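/* Example: with nv=10 vstrips and defer=4, when_flush maps vstrips 0..3
 * to 3, 4..7 to 7, and 8..9 to 9 (clamped to the last vstrip), so
 * flush_here() is true at vstrips 3, 7 and 9 only. */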
1297 
1298 struct vsc_sub_slice_t {
1299     slice_header_t hdr[1];
1300     vector<uint16_t> x;
1301     vector<uint8_t> c;
1302 };
1303 
1304 struct vsc_middle_slice_t {
1305     slice_header_t hdr[1];
1306     vector<vsc_sub_slice_t> sub;
1307 };
1308 
1309 struct vsc_slice_t {
1310     /* This header is mostly a placeholder, and is not here to reflect an
1311      * additional pass on the data. */
1312     slice_header_t hdr[1];
1313     vector<vsc_step_t> steps;
1314     vector<vsc_middle_slice_t> dispatch;
1315     vector<uint32_t> transpose_help;
1316     unsigned long tbuf_space;
1317 };
1318 
1319 /* {{{ decide on our flushing periods (this step is data independent) */
1320 static vector<unsigned int> flush_periods(unsigned int nvstrips)
1321 {
1322     vector<unsigned int> fp;
1323     for(unsigned int nf = 1 ;  ; ) {
1324         unsigned int quo = min(255u, iceildiv(nvstrips, nf));
1325         if (fp.empty() || quo < fp.back()) {
1326             fp.push_back(quo);
1327         }
1328         if (quo == 1)
1329             break;
1330         // next ?
1331         nf = max(nf + 1, (unsigned int) (nf * VSC_BLOCKS_FLUSHFREQ_RATIO));
1332     }
1333     std::reverse(fp.begin(), fp.end());
1334     if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1335         printf("Considering cells to be flushed every");
1336         for(unsigned int s = 0 ; s < fp.size() ; s++) {
1337             printf(" %u", fp[s]);
1338         }
1339         printf(" vstrips\n");
1340     }
1341     ASSERT_ALWAYS(fp.back() == min(255u, nvstrips));
1342     ASSERT_ALWAYS(fp.front() == 1);
1343 
1344     return fp;
1345 }
1346 /* }}} */
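/* Worked example: with nvstrips=97 and VSC_BLOCKS_FLUSHFREQ_RATIO=8 (the
 * currently active parameter set), the loop above tries nf=1, 8, 64, 512,
 * giving quotients 97, 13, 2, 1; the returned periods are thus
 * {1, 2, 13, 97}: the densest step is flushed after every vstrip, the
 * sparsest only once. */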
1347 
1348 /* {{{ read the row weights */
1349 static vector<unsigned long> rowblock_weights(builder * mb, struct vsc_slice_t * V)
1350 {
1351     vector<unsigned long> blockweight;
1352     uint32_t * ptr = mb->rowhead;
1353     /* It's a bit unfortunate, but since here we do speedy parsing of the
1354      * row weights, we're taking a shortcut which is valid only if our
1355      * data set spans the entire column range */
1356     ASSERT_ALWAYS(V->hdr->j0 == 0 && V->hdr->j1 == mb->ncols_t);
1357     for(uint32_t i = V->hdr->i0 ; i < V->hdr->i1 ; ) {
1358         unsigned long bw = 0;
1359         for(uint32_t di = 0 ; di < VSC_BLOCKS_ROW_BATCH && i < V->hdr->i1 ; i++, di++) {
1360             unsigned int w = *ptr++;
1361             ptr += w;
1362             bw += w;
1363         }
1364         blockweight.push_back(bw);
1365     }
1366     return blockweight;
1367 }
1368 /* }}} */
1369 
1370 /*{{{*/
1371 static void
1372 compute_staircase(builder * mb, struct vsc_slice_t * V)
1373 {
1374     unsigned int nvstrips = V->dispatch.size();
1375 
1376     vector<unsigned long> blockweight = rowblock_weights(mb, V);
1377     vector<unsigned int> flushperiod = flush_periods(nvstrips);
1378 
1379     /* {{{ first dispatching pass */
1380     uint32_t ii = V->hdr->i0;
1381     int s = 0;
1382     for(unsigned int k = 0 ; ; k++) {
1383         if (k == blockweight.size() ||
1384                 (s < (int) flushperiod.size() - 1 &&
1385                  blockweight[k] < VSC_BLOCKS_ROW_BATCH * nvstrips / flushperiod[s]))
1386         {
1387             uint32_t old_ii = ii;
1388             ii = min(V->hdr->i0 + k * VSC_BLOCKS_ROW_BATCH, V->hdr->i1);
1389             uint32_t di = ii - old_ii;
1390             if (di) {
1391                 vsc_step_t step;
1392                 step.defer = flushperiod[s];
1393                 step.nrows = di;
1394                 step.tbuf_space = 0;    // to be set later.
1395                 step.density_upper_bound = (s == 0) ?
1396                     UINT_MAX : iceildiv(nvstrips, flushperiod[s-1]);
1397                 V->steps.push_back(step);
1398             }
1399             s++;
1400         }
1401         if (k == blockweight.size())
1402             break;
1403     }
1404     /* }}} */
1405     /* {{{ second pass: do some merges for smallish areas */
1406     ii = V->hdr->i0;
1407     for(unsigned int k = 0 ; k < V->steps.size() ; ) {
1408         unsigned long w = V->steps[k].nrows;
1409         uint32_t old_ii = ii;
1410         int merge_with_next = 0;
1411         int merge_with_previous = 0;
1412         if ((k+1) < V->steps.size())
1413             merge_with_next = w <= VSC_BLOCKS_TOO_SMALL_CUTOFF; // || w <= V->steps[k+1].nrows / 10;
1414         if (k > 0)
1415             merge_with_previous = w <= VSC_BLOCKS_TOO_SMALL_CUTOFF; // || w <= V->steps[k-1].nrows / 10;
1416         if (!merge_with_previous && !merge_with_next) {
1417             ii += w;
1418             k++;
1419             continue;
1420         }
1421         /* Otherwise it's a bit ridiculous to have a separate block for
1422          * such a small number of rows. So by default, we choose to merge
1423          * it with the next block, or the previous one if we're reaching
1424          * the end. */
1425         V->steps.erase(V->steps.begin() + k);
1426         if (merge_with_next) {
1427             verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1428                     "strip %" PRIu32 "+%lu merged with next\n",
1429                     old_ii, w);
1430             ASSERT_ALWAYS(k < V->steps.size());
1431             V->steps[k].nrows += w;
1432             // don't increment k here.
1433             // don't increment ii either.
1434         } else {
1435             ASSERT_ALWAYS(merge_with_previous);
1436             verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1437                     "strip %" PRIu32 "+%lu merged with previous\n",
1438                     old_ii, w);
1439             V->steps[k-1].nrows += w;
1440             // don't increment k here.
1441             ii += w;
1442         }
1443     }
1444     /* }}} */
1445 }
1446 /*}}}*/
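/* Rough picture of the outcome: V->steps now partitions the row range
 * [i0,i1) into consecutive bands with strictly increasing defer values.
 * Coefficients of the early bands (denser rows) are flushed every few
 * vstrips (small defer), whereas coefficients of the later bands
 * (sparser rows) accumulate over many vstrips before being combined ;
 * bands of at most VSC_BLOCKS_TOO_SMALL_CUTOFF rows have been merged
 * into a neighbouring band. */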
1447 
1448 /*{{{*/
1449 static
1450 void vsc_fill_buffers(builder * mb, struct vsc_slice_t * V)
1451 {
1452     unsigned int nvstrips = V->dispatch.size();
1453     uint32_t width = iceildiv(V->hdr->j1 - V->hdr->j0, nvstrips);
1454     ASSERT_ALWAYS(width > 0);
1455     uint32_t * ptr = mb->rowhead;
1456     uint32_t i = V->hdr->i0;
1457     V->tbuf_space = 0;
1458 
1459     for(unsigned int s = 0 ; s < V->steps.size() ; s++) {
1460         /* This strip is scheduled to be flushed every V->steps[s].defer
1461          * vstrips. */
1462         unsigned int defer = V->steps[s].defer;
1463         uint32_t old_ii = i;
1464         uint32_t ii = i + V->steps[s].nrows;
1465         ASSERT_ALWAYS(ii <= V->hdr->i1);
1466         for( ; i < ii ; i++) {
1467             unsigned int w = *ptr++;
1468             for( ; w-- ; ) {
1469                 uint32_t j = *ptr++;
1470                 unsigned int d = j / width;
1471                 V->dispatch[d].sub[s].hdr->ncoeffs++;
1472                 V->dispatch[d].sub[s].x.push_back(j % width);
1473                 unsigned int fidx = when_flush(d,nvstrips,defer);
1474                 V->dispatch[fidx].sub[s].c.push_back(1 + (d % defer));
1475             }
1476             for(unsigned int d = 0 ; d < nvstrips ; d+= defer) {
1477                 /* row change */
1478                 unsigned int fidx = when_flush(d,nvstrips,defer);
1479                 V->dispatch[fidx].sub[s].c.push_back(0);
1480             }
1481         }
1482         /*
1483            if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1484                 printf("Post counts\n");
1485                 for(unsigned int d = 0 ; d < nvstrips ; d++) {
1486                     printf(" (nrows=%u) [%u].sub[%u] : %lu coeffs ; xsize=%zu, csize=%zu\n", V->steps[s].nrows, d, s, V->dispatch[d].sub[s].hdr->ncoeffs, V->dispatch[d].sub[s].x.size(), V->dispatch[d].sub[s].c.size());
1487                 }
1488             }
1489         */
1490         unsigned int acc = 0;
1491         for(unsigned int d = 0 ; d < nvstrips ; d++) {
1492             acc += V->dispatch[d].sub[s].hdr->ncoeffs;
1493             if (!flush_here(d,nvstrips,defer))
1494                 continue;
1495             ASSERT(V->dispatch[d].sub[s].c.size() == acc + V->steps[s].nrows);
1496             acc = 0;
1497         }
1498         unsigned long m = 0;
1499         unsigned long cm = 0;
1500         for(unsigned int d = 0 ; d < nvstrips ; d++) {
1501             if (d % defer == 0) cm = 0;
1502             cm += V->dispatch[d].sub[s].hdr->ncoeffs;
1503             if (cm >= m) m = cm;
1504         }
1505         if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1506             printf("Rows %" PRIu32 "+%" PRIu32, old_ii, V->steps[s].nrows);
1507             if (V->steps[s].density_upper_bound != UINT_MAX) {
1508                 printf(": d < %u", V->steps[s].density_upper_bound);
1509             }
1510             printf("; %u flushes (every %u), tbuf space %lu\n", iceildiv(nvstrips, defer), defer, m);
1511         }
1512         m += 1; // add space for one dummy pointer.
1513         V->steps[s].tbuf_space = m;
1514         V->tbuf_space += m;
1515     }
1516     for(unsigned int d = 0 ; d < nvstrips ; d++) {
1517         for(unsigned int s = 0 ; s < V->steps.size() ; s++) {
1518             V->dispatch[d].hdr->ncoeffs+=V->dispatch[d].sub[s].hdr->ncoeffs;
1519         }
1520         V->hdr->ncoeffs+=V->dispatch[d].hdr->ncoeffs;
1521     }
1522 
1523     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1524             "Total tbuf space %lu (%lu MB)\n",
1525             V->tbuf_space, (V->tbuf_space * sizeof(abelt)) >> 20);
1526 }
1527 /*}}}*/
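/* Encoding of the per-slice c[] streams built above, for reference:
 * within a defer-group, a byte 1 + (d % defer) marks one coefficient
 * coming from vstrip d, and a 0 marks a row change (one 0 per group and
 * per row). The combining pass (matmul_sub_vsc_combine below) walks
 * this stream to decide, value by value, which of the defer temporary
 * buffers to pull from, and when to move on to the next output row. */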
1528 
1529 static int builder_prepare_vsc_slices(builder * mb, struct vsc_slice_t * V, uint32_t i0, uint32_t imax)
1530 {
1531     memset(V->hdr, 0, sizeof(slice_header_t));
1532     V->hdr->t = SLICE_TYPE_DEFER_ENVELOPE;
1533     V->hdr->i0 = i0;
1534     V->hdr->i1 = imax;
1535     V->hdr->j0 = 0;
1536     V->hdr->j1 = mb->ncols_t;
1537     V->hdr->ncoeffs = 0;
1538 
1539     unsigned int nvstrips = iceildiv(V->hdr->j1 - V->hdr->j0, 1 << 16);
1540     safe_assign(V->hdr->nchildren, nvstrips);
1541 
1542     uint32_t width = iceildiv(V->hdr->j1 - V->hdr->j0, nvstrips);
1543     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1544             "%u vstrips of width %" PRIu32 "\n", nvstrips, width);
1545 
1546     // prepare the dispatch slice headers
1547     for(uint32_t j = V->hdr->j0 ; j < V->hdr->j1 ; j += width) {
1548         vsc_middle_slice_t v;
1549         memset(v.hdr, 0, sizeof(slice_header_t));
1550         v.hdr->t = SLICE_TYPE_DEFER_COLUMN;
1551         v.hdr->i0 = V->hdr->i0;
1552         v.hdr->i1 = V->hdr->i1;
1553         v.hdr->j0 = j;
1554         v.hdr->j1 = min(j + width, V->hdr->j1);
1555         /* ncoeffs is set later, as is the data array x */
1556         v.hdr->ncoeffs = 0;
1557         V->dispatch.push_back(v);
1558     }
1559     ASSERT(V->dispatch.size() == nvstrips);
1560 
1561     compute_staircase(mb, V);
1562     for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
1563         safe_assign(V->dispatch[k].hdr->nchildren, V->steps.size());
1564         uint32_t i0 = V->hdr->i0;
1565         for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1566             vsc_sub_slice_t w;
1567             memset(w.hdr, 0, sizeof(slice_header_t));
1568             w.hdr->t = SLICE_TYPE_DEFER_DIS;
1569             w.hdr->i0 = i0;
1570             w.hdr->i1 = (i0 += V->steps[l].nrows);
1571             w.hdr->j0 = V->dispatch[k].hdr->j0;
1572             w.hdr->j1 = V->dispatch[k].hdr->j1;
1573             w.hdr->ncoeffs = 0;
1574             w.hdr->nchildren = 0;
1575             V->dispatch[k].sub.push_back(w);
1576         }
1577     }
1578     vsc_fill_buffers(mb, V);
1579 
1580     return 0;
1581 }
1582 
1583 /* }}} */
1584 
1585 /*************************************************************************/
1586 /* Pushing slices to mm ; all these routines clear the given slice stack */
1587 
1588 /* {{{ small slices */
1589 static void builder_push_small_slice(struct matmul_bucket_data_s * mm, small_slice_t * S)
1590 {
1591     unsigned int ncols_t = mm->public_->dim[!mm->public_->store_transposed];
1592     if (S->is_small2) {
1593         slice_header_t hdr[1];
1594         memset(hdr, 0, sizeof(slice_header_t));
1595         hdr->t = SLICE_TYPE_SMALL2;
1596         hdr->i0 = S->i0;
1597         hdr->i1 = S->i1;
1598         hdr->j0 = 0;
1599         hdr->j1 = ncols_t;
1600         hdr->ncoeffs = S->ncoeffs;
1601         hdr->nchildren = 0;
1602 
1603         /* small2 slices are smaller, and faster -- but more
1604          * restrictive.  */
1605 
1606         /* Because a small2 slice has data in 16bits chunks, we'll
1607          * have to ensure proper alignment in the end. */
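        /* Layout of one small2 entry, for reference: each 16-bit value
         * packs (dj << SMALL_SLICES_I_BITS) | i, that is, the column
         * skip dj since the previous coefficient in the high bits and
         * the row index i (relative to hdr->i0) in the low
         * SMALL_SLICES_I_BITS bits ; dj must fit in SMALL_SLICES_DJ_BITS
         * bits, as asserted in the loop below. */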
1608         ASSERT_ALWAYS((mm->t16.size() & (2-1)) == 0);
1609 
1610         unsigned int j = 0;
1611         uint32_t lu_offset = 0;
1612         for( ; ! S->Lu.empty() ; S->Lu.pop_front(), lu_offset+=1<<16) {
1613             small_slice_t::Lv_t lv;
1614             swap(lv, S->Lu.front());
1615             typedef small_slice_t::Lvci_t Lvci_t;
1616             for(Lvci_t lvp = lv.begin() ; lvp != lv.end() ; ++lvp) {
1617                 uint32_t dj = (lu_offset + lvp->first) - j;
1618                 ASSERT_ALWAYS(dj < (1 << SMALL_SLICES_DJ_BITS) );
1619                 ASSERT_ALWAYS(lvp->second < (1 << SMALL_SLICES_I_BITS) );
1620                 mm->t16.push_back((dj << SMALL_SLICES_I_BITS) | lvp->second);
1621                 j = lu_offset + lvp->first;
1622             }
1623         }
1624 
1625         // align.
1626         for( ; (mm->t16.size() & (2-1)) ; )
1627             mm->t16.push_back(0);
1628 
1629         mm->headers.push_back(*hdr);
1630     } else {
1631         /* How many vertical parts -- this merely has to do with 16-bit
1632          * index wraparound, since we're processing data column-major
1633          * anyway. */
1634         slice_header_t ehdr[1];
1635         vector<slice_header_t> hdrs;
1636         memset(ehdr, 0, sizeof(slice_header_t));
1637         ehdr->t = SLICE_TYPE_SMALL1;
1638         ehdr->i0 = S->i0;
1639         ehdr->i1 = S->i1;
1640         ehdr->j0 = 0;
1641         ehdr->j1 = ncols_t;
1642         ehdr->ncoeffs = S->ncoeffs;
1643 
1644         unsigned int lu_offset = 0;
1645         for( ; ! S->Lu.empty() ; S->Lu.pop_front()) {
1646             small_slice_t::Lv_t lv;
1647             swap(lv, S->Lu.front());
1648             typedef small_slice_t::Lvci_t Lvci_t;
1649 
1650             slice_header_t hdr[1];
1651             memset(hdr, 0, sizeof(slice_header_t));
1652             hdr->t = SLICE_TYPE_SMALL1_VBLOCK;
1653             hdr->i0 = S->i0;
1654             hdr->i1 = S->i1;
1655             hdr->j0 = lu_offset;
1656             hdr->nchildren = 0;
1657             hdr->ncoeffs = lv.size();
1658             for(Lvci_t lvp = lv.begin() ; lvp != lv.end() ; ++lvp) {
1659                 mm->t16.push_back(lvp->first);
1660                 mm->t16.push_back(lvp->second);
1661             }
1662             lu_offset += (1UL << 16);
1663             hdr->j1 = min(lu_offset, ncols_t);
1664             hdrs.push_back(*hdr);
1665         }
1666         safe_assign(ehdr->nchildren, hdrs.size());
1667         mm->headers.push_back(*ehdr);
1668         mm->headers.insert(mm->headers.end(), hdrs.begin(), hdrs.end());
1669     }
1670     mm->public_->ncoeffs += S->ncoeffs;
1671 }
1672 
1673 /* }}} */
1674 
1675 /* {{{ large slices */
1676 static void builder_push_large_slice(struct matmul_bucket_data_s * mm, large_slice_t * L)
1677 {
1678     mm->headers.push_back(*L->hdr);
1679     for( ; ! L->vbl.empty() ; L->vbl.pop_front()) {
1680         large_slice_vblock_t & V(L->vbl.front());
1681         mm->aux.insert(mm->aux.end(), V.auxc.begin(), V.auxc.end());
1682         unsigned t8_size =  V.t8c.size();
1683         mm->t8.insert(mm->t8.end(), V.t8c.begin(), V.t8c.end());
1684         // large slices, but not huge slices, may put an odd number
1685         // of coefficients in t8
1686         if (t8_size & 1) { mm->t8.push_back(0); }
1687     }
1688     mm->public_->ncoeffs += L->hdr->ncoeffs;
1689 }
1690 
1691 /* }}} */
1692 
1693 /* {{{ huge slices */
1694 
1695 static void builder_push_huge_slice(struct matmul_bucket_data_s * mm, huge_slice_t * H)
1696 {
1697     mm->headers.push_back(*H->hdr);
1698     mm->aux.push_back(H->nlarge);
1699     for( ; ! H->vbl.empty() ; H->vbl.pop_front()) {
1700         large_slice_vblock_t & V(H->vbl.front());
1701         mm->aux.insert(mm->aux.end(), V.auxc.begin(), V.auxc.end());
1702         unsigned t8_size =  V.t8c.size();
1703         ASSERT_ALWAYS((t8_size & 1) == 0);
1704         mm->t8.insert(mm->t8.end(), V.t8c.begin(), V.t8c.end());
1705     }
1706     mm->public_->ncoeffs += H->hdr->ncoeffs;
1707 }
1708 /* }}} */
1709 
1710 /******************************************/
1711 /* Iteratively call the building routines */
1712 
1713 /* {{{ small slices */
1714 static void builder_do_all_small_slices(builder * mb, uint32_t * p_i0, uint32_t imax, unsigned int npack)
1715 {
1716     /* npack is a guess for the expected size of small slices ; they are
1717      * arranged later to all have approximately equal size.
1718      */
1719     unsigned int s;
1720     uint32_t i00 = *p_i0;
1721     unsigned int nslices = iceildiv(imax - i00, npack);
1722     uint32_t done = 0;
1723     for(s = 0 ; s < nslices ; s++) {
1724         uint32_t i0 = i00 +  s    * (uint64_t) (imax - i00) / nslices;
1725         uint32_t i1 = i00 + (s+1) * (uint64_t) (imax - i00) / nslices;
1726 
1727         small_slice_t S[1];
1728 
1729         if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1730             printf("Ssl%u: %ss %u+%u...", s, mb->rowname, i0, i1-i0);
1731             fflush(stdout);
1732         }
1733 
1734         int keep = builder_do_small_slice(mb, S, i0, i1);
1735 
1736         if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1737             printf(" w %" PRIu32 " ; avg dj %.1f ; max dj %u%s\n",
1738                     S->ncoeffs, S->dj_avg, S->dj_max,
1739                     S->is_small2 ? " [packed]" : "");
1740             fflush(stdout);
1741         }
1742 
1743         if (!keep) {
1744             verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1745                     "Switching to large slices. Ssl%u to be redone\n", s);
1746             break;
1747         }
1748         builder_push_small_slice(mb->mm, S);
1749         // transfer(Sq, S);
1750         done += i1 - i0;
1751     }
1752     *p_i0 += done;
1753 }
1754 /* }}} */
1755 
1756 /* {{{ large slices */
1757 static void builder_do_all_large_slices(builder * mb, uint32_t * p_i0, unsigned int imax, unsigned int scratch1size)
1758 {
1759     unsigned int rem_nrows = imax - *p_i0;
1760     unsigned int nlarge_slices = iceildiv(rem_nrows, LSL_NBUCKETS_MAX * 256);
1761     uint32_t done = 0;
1762     for(unsigned int s = 0 ; s < nlarge_slices ; s++) {
1763         large_slice_t L[1];
1764         uint32_t i0 = * p_i0 +  s      * (uint64_t) rem_nrows / nlarge_slices;
1765         uint32_t i1 = * p_i0 + (s + 1) * (uint64_t) rem_nrows / nlarge_slices;
1766 
1767         if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1768             printf("Lsl%u %ss %u+%u", s, mb->rowname, i0, i1-i0);
1769             fflush(stdout);
1770         }
1771 
1772         int keep = builder_do_large_slice(mb, L, i0, i1, imax, scratch1size);
1773 
1774         if (!keep) {
1775             verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1776                     "Switching to huge slices. Lsl%u to be redone\n", s);
1777             break;
1778         }
1779 
1780         // transfer(Lq, L);
1781         builder_push_large_slice(mb->mm, L);
1782         done += i1 - i0;
1783     }
1784     *p_i0 += done;
1785 }
1786 /* }}} */
1787 
1788 /* {{{ huge slices */
1789 static void builder_do_all_huge_slices(builder * mb, uint32_t * p_i0, unsigned int imax, unsigned int scratch2size)
1790 {
1791     unsigned int rem_nrows = imax - *p_i0;
1792     unsigned int nhuge_slices = iceildiv(rem_nrows, HUGE_MPLEX_MAX * LSL_NBUCKETS_MAX * 256);
1793     uint32_t done = 0;
1794     for(unsigned int s = 0 ; s < nhuge_slices ; s++) {
1795         huge_slice_t H[1];
1796         uint32_t i0 = * p_i0 +  s      * (uint64_t) rem_nrows / nhuge_slices;
1797         uint32_t i1 = * p_i0 + (s + 1) * (uint64_t) rem_nrows / nhuge_slices;
1798 
1799         if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1800             printf("Hsl%u %ss %u+%u", s, mb->rowname, i0, i1-i0);
1801             fflush(stdout);
1802         }
1803         builder_do_huge_slice(mb, H, i0, i1, scratch2size);
1804         builder_push_huge_slice(mb->mm, H);
1805         // transfer(Hq, H);
1806         done += i1 - i0;
1807     }
1808     *p_i0 += done;
1809 }
1810 /* }}} */
1811 
1812 #define xxxCOMPRESS_COMBINERS_1
1813 #define xxxCOMPRESS_COMBINERS_2
1814 #define xxxCOMPRESS_COMBINERS_4
1815 
1816 #if defined(COMPRESS_COMBINERS_1) || \
1817     defined(COMPRESS_COMBINERS_2) || \
1818     defined(COMPRESS_COMBINERS_4)
1819 #ifdef MATMUL_SUB_VSC_COMBINE_H_
1820 #error "Please either fix or disable the assembly code for COMPRESS_COMBINERS"
1821 /* Anyway this idea doesn't work so well and makes little difference
1822  * to the running time. */
1823 #endif
1824 #endif
1825 
1826 
1827 /* {{{ staircase */
1828 static unsigned long compressed_size(unsigned long s, unsigned int defer MAYBE_UNUSED)
1829 {
1830     if (0) {
1831 #ifdef COMPRESS_COMBINERS_1
1832     } else if (defer == 1) {
1833         return iceildiv(s, 8);
1834 #endif
1835 #ifdef COMPRESS_COMBINERS_2
1836     } else if (defer <= 3) {
1837         return iceildiv(s, 4);
1838 #endif
1839 #ifdef COMPRESS_COMBINERS_4
1840     } else if (defer <= 15) {
1841         return iceildiv(s, 2);
1842 #endif
1843     }
1844     return s;
1845 }
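/* For reference, the compression rates implied above when the
 * corresponding COMPRESS_COMBINERS_* macros are enabled: defer == 1
 * streams use 1 bit per entry (8 per byte), defer <= 3 streams 2 bits,
 * defer <= 15 streams 4 bits ; otherwise one byte per entry is kept.
 * E.g. s == 1000 entries at defer == 3 would occupy
 * iceildiv(1000, 4) == 250 bytes instead of 1000. */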
1846 
1847 static void append_compressed(vector<uint8_t>& t8, vector<uint8_t> const& S, unsigned int defer MAYBE_UNUSED)
1848 {
1849     if (0) {
1850 #ifdef  COMPRESS_COMBINERS_1
1851     } else if (defer == 1) {
1852         const unsigned int nbits = 1;
1853         unsigned int c = S.size();
1854         t8.reserve(t8.size() + compressed_size(S.size(), defer));
1855         for(unsigned int i = 0 ; i < c ; i += 8 / nbits) {
1856             uint8_t x = 0;
1857             for(unsigned int j = 0 ; nbits * j < 8 && i + j < c ; j++) {
1858                 x ^= S[i+j] << (nbits * j);
1859             }
1860             t8.push_back(x);
1861         }
1862 #endif
1863 #ifdef COMPRESS_COMBINERS_2
1864     } else if (defer <= 3) {
1865         const unsigned int nbits = 2;
1866         unsigned int c = S.size();
1867         t8.reserve(t8.size() + compressed_size(S.size(), defer));
1868         for(unsigned int i = 0 ; i < c ; i += 8 / nbits) {
1869             uint8_t x = 0;
1870             for(unsigned int j = 0 ; nbits * j < 8 && i + j < c ; j++) {
1871                 x ^= S[i+j] << (nbits * j);
1872             }
1873             t8.push_back(x);
1874         }
1875 #endif
1876 #ifdef COMPRESS_COMBINERS_4
1877     } else if (defer <= 15) {
1878         const unsigned int nbits = 4;
1879         unsigned int c = S.size();
1880         t8.reserve(t8.size() + compressed_size(S.size(), defer));
1881         for(unsigned int i = 0 ; i < c ; i += 8 / nbits) {
1882             uint8_t x = 0;
1883             for(unsigned int j = 0 ; nbits * j < 8 && i + j < c ; j++) {
1884                 x ^= S[i+j] << (nbits * j);
1885             }
1886             t8.push_back(x);
1887         }
1888 #endif
1889     } else {
1890         t8.insert(t8.end(), S.begin(), S.end());
1891     }
1892 }
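/* Packing example (with COMPRESS_COMBINERS_2 enabled, illustration
 * only): for defer == 3 and S = { 1, 0, 2, 3, 1, 0 }, each value takes
 * nbits == 2, so the first output byte is
 * 1 | (0<<2) | (2<<4) | (3<<6) == 0xe1 and the second 1 | (0<<2) == 0x01.
 * Without any COMPRESS_COMBINERS_* macro defined (the default, see the
 * xxx-prefixed #define's above), S is appended to t8 verbatim. */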
1893 
1894 static void builder_push_vsc_slices(struct matmul_bucket_data_s * mm, vsc_slice_t * V)
1895 {
1896     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1897             "Flushing staircase slices\n");
1898 
1899     mm->headers.push_back(*V->hdr);
1900 
1901     mm->aux.push_back(V->dispatch.size());
1902     mm->aux.push_back(V->steps.size());
1903 
1904     for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1905         mm->aux.push_back(V->steps[l].defer);
1906         mm->aux.push_back(V->steps[l].tbuf_space);
1907     }
1908 
1909     /* all headers */
1910     for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
1911         mm->headers.push_back(*V->dispatch[k].hdr);
1912     }
1913     for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
1914         for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1915             mm->headers.push_back(*V->dispatch[k].sub[l].hdr);
1916         }
1917     }
1918 
1919     /* We also add header information for the combination operations,
1920      * even though their payload is stored alongside the dispatching
1921      * data for convenience. The combination headers are needed to keep
1922      * proper track of where the time is eventually spent.
1923      *
1924      * The combining headers are emitted on the fly, while flushing the
1925      * combination points below.
1926      */
1927 
1928     unsigned int cidx = mm->headers.size();
1929 
1930     for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1931         slice_header_t chdr[1];
1932         memset(chdr, 0, sizeof(slice_header_t));
1933         chdr->t = SLICE_TYPE_DEFER_ROW;
1934         chdr->i0 = V->dispatch[0].sub[l].hdr->i0;
1935         chdr->i1 = V->dispatch[0].sub[l].hdr->i1;
1936         chdr->j0 = V->hdr->j0;
1937         chdr->j1 = V->hdr->j1;
1938         chdr->nchildren = 0;
1939         chdr->ncoeffs = 0;
1940         mm->headers.push_back(*chdr);
1941     }
1942 
1943     vector<pair<pair<unsigned int, unsigned int>, pair<uint64_t, uint64_t> > > csizes;
1944 
1945     // unsigned int o8 = mm->t8.size();
1946 
1947     /* dispatch data goes in dispatching order */
1948     for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
1949         vsc_middle_slice_t& D(V->dispatch[k]);
1950 
1951         /* The stream of u16 values corresponding to this strip */
1952         for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1953             vsc_sub_slice_t & S(D.sub[l]);
1954             // printf("save dispatch(%zu), sum=%x\n", S.x.size(), idiotic_sum((void*) ptrbegin(S.x), S.x.size() * sizeof(uint16_t)));
1955             mm->t16.insert(mm->t16.end(), S.x.begin(), S.x.end());
1956             S.x.clear();
1957         }
1958         /* Now the combining data */
1959         for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1960             if (!flush_here(k, V->dispatch.size(), V->steps[l].defer))
1961                 continue;
1962             unsigned int k0 = k - (k % V->steps[l].defer);
1963             /* Think about the combining header ! */
1964             slice_header_t chdr[1];
1965             memset(chdr, 0, sizeof(slice_header_t));
1966             chdr->t = SLICE_TYPE_DEFER_CMB;
1967             chdr->i0 = V->dispatch[k0].sub[l].hdr->i0;
1968             chdr->i1 = V->dispatch[k0].sub[l].hdr->i1;
1969             chdr->j0 = V->dispatch[k0].sub[l].hdr->j0;
1970             chdr->j1 = V->dispatch[k].sub[l].hdr->j1;
1971             chdr->nchildren = 0;
1972             chdr->ncoeffs = 0;
1973             /* flush this one */
1974             vsc_sub_slice_t & S(V->dispatch[k].sub[l]);
1975             for( ; k0 <= k ; k0++) {
1976                 chdr->ncoeffs += V->dispatch[k0].sub[l].hdr->ncoeffs;
1977             }
1978             unsigned int defer = V->steps[l].defer;
1979 
1980             ASSERT_ALWAYS(S.c.size() == chdr->ncoeffs + V->steps[l].nrows);
1981 
1982             csizes.push_back(make_pair(
1983                         make_pair(k - (k % defer), l),
1984                         make_pair(0, S.c.size())));
1985             mm->headers[cidx + l].ncoeffs += chdr->ncoeffs;
1986             // printf("save combine(%zu), sum=%x\n", S.c.size(), idiotic_sum((void*) ptrbegin(S.c), S.c.size()));
1987             // printf("m=%u\n", *max_element(S.c.begin(), S.c.end()));
1988             append_compressed(mm->t8, S.c, defer);
1989             S.c.clear();
1990             mm->headers.push_back(*chdr);
1991         }
1992     }
1993 
1994     /* This fixes the reading order w.r.t. the positioning and size of
1995      * the combining slices */
1996     uint64_t p8 = 0;
1997     for(unsigned int i = 0 ; i < csizes.size() ; i++) {
1998         // unsigned int k = csizes[i].first.first;
1999         unsigned int l = csizes[i].first.second;
2000         unsigned long s = csizes[i].second.second;
2001         unsigned int defer = V->steps[l].defer;
2002         unsigned long count = compressed_size(s, defer);
2003         /*
2004         printf("straight-order (defer %u, vstrip #%u)"
2005                 ": @%" PRIu64 " combine(%" PRIu64 "), sum=%x\n",
2006                 V->steps[csizes[i].first.second].defer, csizes[i].first.first,
2007                 csizes[i].second.first, csizes[i].second.second,
2008                 idiotic_sum((void*) &*(mm->t8.begin() + o8 + csizes[i].second.first), csizes[i].second.second));
2009                 */
2010         csizes[i].second.first = p8;
2011         p8 += count;
2012     }
2013     sort(csizes.begin(), csizes.end());
2014     mm->aux.push_back(2 * csizes.size() + 1);
2015     for(unsigned int i = 0 ; i < csizes.size() ; i++) {
2016         /*
2017         printf("straight-order (defer %u, vstrip #%u)"
2018                 ": @%" PRIu64 " combine(%" PRIu64 "), sum=%x\n",
2019                 V->steps[csizes[i].first.second].defer, csizes[i].first.first,
2020                 csizes[i].second.first, csizes[i].second.second,
2021                 idiotic_sum((void*) &*(mm->t8.begin() + o8 + csizes[i].second.first), csizes[i].second.second));
2022                 */
2023         mm->aux.push_back(csizes[i].second.first);
2024         mm->aux.push_back(csizes[i].second.second);
2025     }
2026     mm->aux.push_back(p8);
2027 
2028     mm->public_->ncoeffs += V->hdr->ncoeffs;
2029 }
2030 /* }}} */
2031 
2032 void MATMUL_NAME(build_cache)(matmul_ptr mm0, uint32_t * data, size_t size MAYBE_UNUSED)
2033 {
2034     struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
2035     builder mb[1];
2036     builder_init(mb, mm, data);
2037 
2038     verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
2039             "%u rows %u cols\n", mm->public_->dim[0], mm->public_->dim[1]);
2040 
2041     uint32_t main_i0 = 0;
2042     uint32_t fence = mb->nrows_t;
2043     if (mm->methods.small1 || mm->methods.small2)
2044         builder_do_all_small_slices(mb, &main_i0, fence, mm->npack);
2045 
2046     if (mm->methods.large)
2047         builder_do_all_large_slices(mb, &main_i0, fence, mm->scratch1size);
2048 
2049     /* Note that vsc and huge are exclusive ! */
2050     if (mm->methods.vsc && main_i0 < fence) {
2051         vsc_slice_t V[1];
2052         builder_prepare_vsc_slices(mb, V, main_i0, fence);
2053         mm->scratch3size = MAX(mm->scratch3size, V->tbuf_space);
2054         builder_push_vsc_slices(mm, V);
2055         main_i0 = fence;
2056     }
2057     if (mm->methods.huge && main_i0 < fence) {
2058         builder_do_all_huge_slices(mb, &main_i0, fence, mm->scratch2size);
2059     }
2060     if (main_i0 < fence) {
2061         fprintf(stderr, "ARGH ! only created a submatrix (%" PRIu32 " < %" PRIu32 ") !!\n", main_i0, fence);
2062         exit(1);
2063     }
2064 
2065 
2066     /* done, at last ! */
2067 
2068     builder_clear(mb);
2069 
2070     mm_finish_init(mm);
2071 }
2072 
2073 int MATMUL_NAME(reload_cache)(matmul_ptr mm0)/* {{{ */
2074 {
2075     struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
2076     FILE * f;
2077 
2078     f = matmul_common_reload_cache_fopen(sizeof(abelt), mm->public_, MM_MAGIC);
2079     if (!f) return 0;
2080 
2081     for( ;; ) {
2082         slice_header_t hdr[1];
2083         MATMUL_COMMON_READ_ONE16(hdr->t, f);
2084         MATMUL_COMMON_READ_ONE16(hdr->nchildren, f);
2085         MATMUL_COMMON_READ_MANY8(hdr->pad, 4, f);
2086         MATMUL_COMMON_READ_ONE32(hdr->i0, f);
2087         MATMUL_COMMON_READ_ONE32(hdr->i1, f);
2088         MATMUL_COMMON_READ_ONE32(hdr->j0, f);
2089         MATMUL_COMMON_READ_ONE32(hdr->j1, f);
2090         MATMUL_COMMON_READ_ONE64(hdr->ncoeffs, f);
2091         if (hdr->t == SLICE_TYPE_NONE)
2092             break;
2093         mm->headers.push_back(*hdr);
2094     }
2095 
2096     MATMUL_COMMON_READ_ONE32(mm->scratch1size, f);
2097     MATMUL_COMMON_READ_ONE32(mm->scratch2size, f);
2098     MATMUL_COMMON_READ_ONE32(mm->scratch3size, f);
2099 
2100     size_t n16, n8, naux;
2101     MATMUL_COMMON_READ_ONE32(n16, f);
2102     MATMUL_COMMON_READ_ONE32(n8, f);
2103     MATMUL_COMMON_READ_ONE32(naux, f);
2104     mm->t16.resize(n16);
2105     mm->t8.resize(n8);
2106     mm->aux.resize(naux);
2107     MATMUL_COMMON_READ_MANY16(ptrbegin(mm->t16), n16, f);
2108     MATMUL_COMMON_READ_MANY8(ptrbegin(mm->t8), n8, f);
2109     MATMUL_COMMON_READ_MANY32(ptrbegin(mm->aux), naux, f);
2110 
2111     fclose(f);
2112 
2113     mm_finish_init(mm);
2114 
2115     return 1;
2116 }/*}}}*/
2117 
2118 void MATMUL_NAME(save_cache)(matmul_ptr mm0)/*{{{*/
2119 {
2120     struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
2121     FILE * f;
2122 
2123     f = matmul_common_save_cache_fopen(sizeof(abelt), mm->public_, MM_MAGIC);
2124     if (!f) return;
2125 
2126     for(unsigned int h = 0 ; h < mm->headers.size() ; h++) {
2127         slice_header_t * hdr = & (mm->headers[h]);
2128         MATMUL_COMMON_WRITE_ONE16(hdr->t, f);
2129         MATMUL_COMMON_WRITE_ONE16(hdr->nchildren, f);
2130         MATMUL_COMMON_WRITE_MANY8(hdr->pad, 4, f);
2131         MATMUL_COMMON_WRITE_ONE32(hdr->i0, f);
2132         MATMUL_COMMON_WRITE_ONE32(hdr->i1, f);
2133         MATMUL_COMMON_WRITE_ONE32(hdr->j0, f);
2134         MATMUL_COMMON_WRITE_ONE32(hdr->j1, f);
2135         MATMUL_COMMON_WRITE_ONE64(hdr->ncoeffs, f);
2136     }
2137     /* terminating all-zero header (read back as SLICE_TYPE_NONE) ; also useful padding */
2138     MATMUL_COMMON_WRITE_ONE16(0, f);
2139     MATMUL_COMMON_WRITE_ONE16(0, f);
2140     MATMUL_COMMON_WRITE_ONE32(0, f);    // nchildren+padding
2141     MATMUL_COMMON_WRITE_ONE32(0, f);
2142     MATMUL_COMMON_WRITE_ONE32(0, f);
2143     MATMUL_COMMON_WRITE_ONE32(0, f);
2144     MATMUL_COMMON_WRITE_ONE32(0, f);
2145     MATMUL_COMMON_WRITE_ONE64(0, f);
2146 
2147     MATMUL_COMMON_WRITE_ONE32(mm->scratch1size, f);
2148     MATMUL_COMMON_WRITE_ONE32(mm->scratch2size, f);
2149     MATMUL_COMMON_WRITE_ONE32(mm->scratch3size, f);
2150     size_t n16 = mm->t16.size();
2151     size_t n8 = mm->t8.size();
2152     size_t naux = mm->aux.size();
2153     MATMUL_COMMON_WRITE_ONE32(n16, f);
2154     MATMUL_COMMON_WRITE_ONE32(n8, f);
2155     MATMUL_COMMON_WRITE_ONE32(naux, f);
2156     MATMUL_COMMON_WRITE_MANY16(ptrbegin(mm->t16), n16, f);
2157     MATMUL_COMMON_WRITE_MANY8(ptrbegin(mm->t8), n8, f);
2158     MATMUL_COMMON_WRITE_MANY32(ptrbegin(mm->aux), naux, f);
2159 
2160     fclose(f);
2161 }/*}}}*/
2162 
2163 // static inline uint32_t read32(uint16_t const * & q) /* {{{ */
2164 // {
2165 //     uint32_t res;
2166 //     res = *q++;
2167 //     res |= ((uint32_t) *q++) << 16;
2168 //     return res;
2169 // } /* }}} */
2170 
2171 #ifndef MATMUL_SUB_SMALL1_H_
2172 static const uint16_t * matmul_sub_small1(abdst_field x, abelt * where, abelt const * from, const uint16_t * q, unsigned int count)
2173 {
2174         for(uint32_t c = 0 ; c < count ; c++) {
2175             uint32_t j = *q++;
2176             uint32_t di = *q++;
2177             // ASSERT(j < (1UL << 16));
2178             // ASSERT(hdr->j0 + j < hdr->j1);
2179             // ASSERT(hdr->i0 + di < hdr->i1);
2180             abadd(x, where[di], where[di], from[j]);
2181         }
2182         return q;
2183 }
2184 #endif
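/* For reference, the small1 t16 stream decoded above is a flat list of
 * (j, di) pairs of 16-bit values, one pair per coefficient: j is the
 * column index within the current vblock (hence < 2^16) and di the row
 * index relative to hdr->i0, exactly as emitted by
 * builder_push_small_slice(). */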
2185 
2186 #ifndef MATMUL_SUB_SMALL1_TR_H_
2187 static const uint16_t * matmul_sub_small1_tr(abdst_field x, abelt * where, abelt const * from, const uint16_t * q, unsigned int count)
2188 {
2189     for(uint32_t c = 0 ; c < count ; c++) {
2190         uint32_t j = *q++;
2191         uint32_t di = *q++;
2192         // ASSERT(j < (1UL << 16));
2193         // ASSERT(hdr->j0 + j < hdr->j1);
2194         // ASSERT(hdr->i0 + di < hdr->i1);
2195         abadd(x, where[j], where[j], from[di]);
2196     }
2197     return q;
2198 }
2199 #endif
2200 
2201 #ifndef MATMUL_SUB_SMALL2_H_
2202 static const uint16_t * matmul_sub_small2(abdst_field x, abelt * where, abelt const * from, const uint16_t * q, unsigned int count)
2203 {
2204     uint32_t j = 0;
2205     for(uint32_t c = 0 ; c < count ; c++) {
2206         uint16_t h = *q++;
2207         j += h >> SMALL_SLICES_I_BITS;
2208         // ASSERT(j < pos->ncols_t);
2209         uint32_t di = h & ((1UL << SMALL_SLICES_I_BITS) - 1);
2210         // ASSERT(hdr->j0 + j < hdr->j1);
2211         // ASSERT(hdr->i0 + di < hdr->i1);
2212         abadd(x, where[di], where[di], from[j]);
2213     }
2214     return q;
2215 }
2216 #endif
2217 
2218 #ifndef MATMUL_SUB_SMALL2_TR_H_
2219 static const uint16_t * matmul_sub_small2_tr(abdst_field x, abelt * where, abelt const * from, const uint16_t * q, unsigned int count)
2220 {
2221     uint32_t j = 0;
2222     for(uint32_t c = 0 ; c < count ; c++) {
2223         uint16_t h = *q++;
2224         j += h >> SMALL_SLICES_I_BITS;
2225         // ASSERT(j < pos->ncols_t);
2226         uint32_t di = h & ((1UL << SMALL_SLICES_I_BITS) - 1);
2227         // ASSERT(hdr->j0 + j < hdr->j1);
2228         // ASSERT(hdr->i0 + di < hdr->i1);
2229         abadd(x, where[j], where[j], from[di]);
2230     }
2231     return q;
2232 }
2233 #endif
2234 
2235 struct pos_desc {
2236     const uint16_t * q16;
2237     const uint8_t * q8;
2238     const unsigned int * ql;
2239     uint32_t i;
2240     uint32_t nrows_t;
2241     uint32_t ncols_t;
2242 };
2243 
2244 static inline void matmul_bucket_mul_small1_vblock(struct matmul_bucket_data_s * mm, slice_header_t * hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2245 {
2246     ASM_COMMENT("multiplication code -- small1 (dense) slices"); /* {{{ */
2247     int usual = d == ! mm->public_->store_transposed;
2248     abdst_field x = mm->xab;
2249     abelt * where      = dst + (usual ? hdr->i0 : hdr->j0);
2250     abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2251     if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2252         abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2253     }
2254 
2255     uint32_t ncoeffs_slice = hdr->ncoeffs;
2256     ASSERT_ALWAYS(pos->i == hdr->i0);   // FIXME -- should disappear.
2257 
2258     if (usual) {
2259         pos->q16 = matmul_sub_small1(x, where, from, pos->q16, ncoeffs_slice);
2260     } else {
2261         pos->q16 = matmul_sub_small1_tr(x, where, from, pos->q16, ncoeffs_slice);
2262     }
2263     ASM_COMMENT("end of small1 (dense) slices"); /* }}} */
2264 }
2265 
2266 static inline void matmul_bucket_mul_small2(struct matmul_bucket_data_s * mm, slice_header_t * hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2267 {
2268     ASM_COMMENT("multiplication code -- small2 (dense) slices"); /* {{{ */
2269     int usual = d == ! mm->public_->store_transposed;
2270     abdst_field x = mm->xab;
2271     abelt * where      = dst + (usual ? hdr->i0 : hdr->j0);
2272     abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2273 
2274     if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2275         abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2276     }
2277 
2278     uint32_t ncoeffs_slice = hdr->ncoeffs;
2279     ASSERT_ALWAYS(pos->i == hdr->i0);   // FIXME -- should disappear.
2280 
2281     if (usual) {
2282         pos->q16 = matmul_sub_small2(x, where, from, pos->q16, ncoeffs_slice);
2283     } else {
2284         pos->q16 = matmul_sub_small2_tr(x, where, from, pos->q16, ncoeffs_slice);
2285     }
2286     /* fix alignment in any case */
2287     pos->q16 += (2 - 1) & - ncoeffs_slice;
2288     ASM_COMMENT("end of small2 (dense) slices"); /* }}} */
2289 }
2290 
2291 // static inline void prepare_buckets_and_fences(abdst_field x MAYBE_UNUSED, abelt ** b, abelt ** f, abelt * z, const unsigned int * ql, unsigned int n)
2292 // {
2293 //     for(unsigned int k = 0 ; k < n ; k++) {
2294 //         b[k] = z;
2295 //         f[k] = b[k] + ql[k];
2296 //         z += ql[k];
2297 //     }
2298 // }
2299 
2300 static inline void prepare_buckets(abdst_field x MAYBE_UNUSED, abelt ** b, abelt * z, const unsigned int * ql, unsigned int n)
2301 {
2302     for(unsigned int k = 0 ; k < n ; k++) {
2303         b[k] = z;
2304         z += ql[k];
2305     }
2306 }
2307 
2308 #ifndef MATMUL_SUB_LARGE_FBI_H_
2309 static inline void matmul_sub_large_fbi(abdst_field x, abelt ** sb, const abelt * z, const uint8_t * q, unsigned int n)
2310 {
2311     /* Dispatch the data found in z[0]...z[f(n-1)]: z[f(i)] goes into the
2312      * array pointed to by sb[q[2*i+1]], where f(i) is the partial sum
2313      * q[0]+q[2]+...+q[2*i]. Exactly 2n bytes are expected in q[]. All
2314      * the sb[] pointers are advanced as data is written. */
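    /* Example with made-up values: q = { 3, 7, 0, 2, 5, 7 } means
     *      z += 3 ; z[0] -> sb[7]
     *      z += 0 ; z[0] -> sb[2]   (same source entry, another bucket)
     *      z += 5 ; z[0] -> sb[7]
     * i.e. the source entries at offsets 3, 3 and 8 feed buckets 7, 2
     * and 7 respectively. */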
2315     for(unsigned int c = 0 ; c < n ; c++) {
2316         z += *q;
2317         // we might receive zmax and do some checking (see caller)
2318         // ASSERT_ALWAYS(z < zmax);
2319         q++;
2320         abset(x, sb[*q][0], z[0]);
2321         sb[*q]+= 1;
2322         q++;
2323     }
2324 }
2325 #endif
2326 
2327 #ifndef MATMUL_SUB_LARGE_FBI_TR_H_
2328 static inline void
2329 matmul_sub_large_fbi_tr(abdst_field x, abelt ** sb, abelt * z, const uint8_t * q, unsigned int n)
2330 {
2331     /* Does the converse of the above */
2332     for(unsigned int c = 0 ; c < n ; c++) {
2333         z += *q;
2334         q++;
2335         abadd(x, z[0], z[0], sb[*q][0]);
2336         sb[*q]+= 1;
2337         q++;
2338     }
2339 }
2340 #endif
2341 
2342 #ifndef MATMUL_SUB_LARGE_ASB_H_
2343 // static void matmul_sub_large_asb(abdst_field x, abelt * dst, const abelt * z, const uint8_t * q, const unsigned int * ql) __attribute__((__noinline__));
2344 static void matmul_sub_large_asb(abdst_field x, abelt * dst, const abelt * z, const uint8_t * q, const unsigned int * ql)
2345 {
2346     /* This ``applies'' the LSL_NBUCKETS_MAX small buckets whose
2347      * respective lengths are given by ql[0] to ql[LSL_NBUCKETS_MAX-1].
2348      *
2349      * Bucket k consumes the next ql[k] entries of both z and q: each
2350      * such z value is added to dst[k*256 + *q], the matching byte of q
2351      * giving the destination index within bucket k. */
2352     for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
2353         unsigned int l = ql[k];
2354         for( ; l-- ; ) {
2355             /* For padding coeffs, the assertion can fail if
2356              * we choose a row not equal to (0,0) -- first in
2357              * the first bucket.
2358              */
2359             abadd(x, dst[*q], dst[*q], z[0]);
2360             z+= 1;
2361             q++;
2362         }
2363         dst += 256;
2364     }
2365 }
2366 #endif
2367 
2368 #ifndef MATMUL_SUB_LARGE_ASB_TR_H_
2369 static inline void matmul_sub_large_asb_tr(abdst_field x, const abelt * src, abelt * z, const uint8_t * q, const unsigned int * ql)
2370 {
2371     /* converse of the above */
2372     for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
2373         unsigned int l = ql[k];
2374         for( ; l-- ; ) {
2375             /* For padding coeffs, the assertion can fail if
2376              * we choose a row not equal to (0,0) -- first in
2377              * the first bucket.
2378              */
2379             abset(x, z[0], src[*q]);
2380             z += 1;
2381             q++;
2382         }
2383         src += 256;
2384     }
2385 }
2386 #endif
2387 
2388 #ifndef MATMUL_SUB_LARGE_FBD_H_
2389 static void
2390 matmul_sub_large_fbd(abdst_field x, abelt ** sb, const abelt * z, const uint8_t * q, unsigned int n)
2391 {
2392     /* Dispatch the data found in z[0]...z[n-1]: z[i] goes into the array
2393      * pointed to by sb[q[i]]. Exactly n coefficients are expected. All
2394      * the sb[] pointers are advanced as data is written. */
2395     for(unsigned int c = 0 ; c < n ; c++) {
2396         abset(x, sb[q[c]][0], z[0]);
2397         sb[q[c]]+= 1;
2398         z += 1;
2399     }
2400 }
2401 #endif
2402 
2403 #ifndef MATMUL_SUB_LARGE_FBD_TR_H_
2404 static inline void
2405 matmul_sub_large_fbd_tr(abdst_field x, abelt ** sb, abelt * z, const uint8_t * q, unsigned int n)
2406 {
2407     /* Does the converse of the above */
2408     for(unsigned int c = 0 ; c < n ; c++) {
2409         abset(x, z[0], sb[q[c]][0]);
2410         sb[q[c]]+= 1;
2411         z += 1;
2412     }
2413 }
2414 #endif
2415 
2416 static inline void matmul_bucket_mul_large(struct matmul_bucket_data_s * mm, slice_header_t * hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2417 {
2418     ASM_COMMENT("multiplication code -- large (sparse) slices"); /* {{{ */
2419 
2420     abdst_field x = mm->xab;
2421 
2422     int usual = d == ! mm->public_->store_transposed;
2423 
2424     abelt * where      = dst + (usual ? hdr->i0 : hdr->j0);
2425     abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2426     if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2427         abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2428     }
2429 
2430     abelt * scratch = mm->scratch1;
2431 
2432     uint32_t j = 0;
2433 
2434     if (usual) {
2435         for( ; j < pos->ncols_t ; ) {
2436             uint32_t j1 = j + *pos->ql++;
2437             uint32_t n = *pos->ql++;
2438             abelt * bucket[LSL_NBUCKETS_MAX];
2439             abelt const * inp = from + j;
2440             prepare_buckets(x,bucket,scratch,pos->ql,LSL_NBUCKETS_MAX);
2441             ASSERT_ALWAYS((((unsigned long)pos->q8)&1)==0);
2442             matmul_sub_large_fbi(x, bucket, inp, pos->q8, n);
2443             // the (inp) variable in the call above never exceeds from +
2444             // j1, although we don't do the check in reality.
2445             // matmul_sub_large_fbi_boundschecked(x, bucket, inp, pos->q8, n, from + j1);
2446             matmul_sub_large_asb(x, where, scratch, pos->q8+2*n, pos->ql);
2447             pos->q8 += 3*n;
2448             // fix alignment !
2449             pos->q8 += n & 1;
2450             /* FIXME. LSL_NBUCKETS_MAX is a compile-time constant, but it
2451              * might differ in the computed cache file !!! (occurs here
2452              * and in several other places as well) */
2453             pos->ql += LSL_NBUCKETS_MAX;
2454             j = j1;
2455         }
2456     } else {
2457         for( ; j < pos->ncols_t ; ) {
2458             uint32_t j1 = j + *pos->ql++;
2459             uint32_t n = *pos->ql++;
2460             abelt * bucket[LSL_NBUCKETS_MAX];
2461             abelt * outp = where + j;
2462             prepare_buckets(x,bucket,scratch,pos->ql,LSL_NBUCKETS_MAX);
2463             matmul_sub_large_asb_tr(x, from, scratch, pos->q8+2*n, pos->ql);
2464             ASSERT_ALWAYS((((unsigned long)pos->q8)&1)==0);
2465             matmul_sub_large_fbi_tr(x, bucket, outp, pos->q8, n);
2466             pos->q8 += 3 * n;
2467             // fix alignment !
2468             pos->q8 += n & 1;
2469             pos->ql += LSL_NBUCKETS_MAX;
2470             j = j1;
2471         }
2472     }
2473     ASM_COMMENT("end of large (sparse) slices"); /* }}} */
2474 }
2475 
2476 static inline void matmul_bucket_mul_huge(struct matmul_bucket_data_s * mm, slice_header_t * hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2477 {
2478     ASM_COMMENT("multiplication code -- huge (very sparse) slices"); /* {{{ */
2479 
2480     abdst_field x = mm->xab;
2481 
2482     int usual = d == ! mm->public_->store_transposed;
2483 
2484     abelt * where      = dst + (usual ? hdr->i0 : hdr->j0);
2485     abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2486     if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2487         abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2488     }
2489 
2490     abelt * scratch = mm->scratch1;
2491 
2492     uint32_t j = 0;
2493 
2494     uint32_t di = hdr->i1 - hdr->i0;
2495     unsigned int nlarge = *pos->ql++;
2496     uint32_t di_sub = iceildiv(di, nlarge);
2497 
2498     if (usual) {
2499         /* ok -- hold your breath a second. */
2500         for( ; j < pos->ncols_t ; ) {
2501             uint32_t j1 = j + *pos->ql++;
2502             unsigned int n = *pos->ql++;
2503             ASSERT_ALWAYS(n <= mm->scratch2size);
2504             abelt * scratch2 = mm->scratch2;
2505             abelt const * inp = src + j;
2506             abelt * bucket[HUGE_MPLEX_MAX];
2507             const unsigned int * Lsizes = pos->ql;
2508             prepare_buckets(x,bucket,scratch2,pos->ql,nlarge);
2509             pos->ql += nlarge;
2510             ASSERT_ALWAYS((((unsigned long)pos->q8)&1)==0);
2511             matmul_sub_large_fbi(x, bucket, inp, pos->q8, n);
2512             pos->q8 += 2 * n;
2513             for(unsigned int k = 0 ; k < nlarge ; k++) {
2514                 abelt * sbucket[LSL_NBUCKETS_MAX];
2515                 prepare_buckets(x,sbucket,scratch,pos->ql,LSL_NBUCKETS_MAX);
2516                 bucket[k] -= Lsizes[k];
2517                 matmul_sub_large_fbd(x, sbucket, bucket[k], pos->q8, Lsizes[k]);
2518                 pos->q8 += Lsizes[k];
2519                 abelt * outp = where + k * di_sub;
2520                 matmul_sub_large_asb(x, outp, scratch, pos->q8, pos->ql);
2521                 pos->q8 += Lsizes[k];
2522                 pos->ql += LSL_NBUCKETS_MAX;
2523             }
2524             j = j1;
2525         }
2526     } else {
2527         for( ; j < pos->ncols_t ; ) {
2528             uint32_t j1 = j + *pos->ql++;
2529             unsigned int n = *pos->ql++;
2530             abelt * scratch2 = mm->scratch2;
2531             abelt * outp = dst + j;
2532             abelt * bucket[HUGE_MPLEX_MAX];
2533             const unsigned int * Lsizes = pos->ql;
2534             prepare_buckets(x,bucket,scratch2,pos->ql,nlarge);
2535             pos->ql += nlarge;
2536             const uint8_t * q8_saved = pos->q8;
2537             pos->q8 += 2 * n;
2538             for(unsigned int k = 0 ; k < nlarge ; k++) {
2539                 abelt * sbucket[LSL_NBUCKETS_MAX];
2540                 prepare_buckets(x,sbucket,scratch,pos->ql,LSL_NBUCKETS_MAX);
2541                 const uint8_t * fill = pos->q8;
2542                 const uint8_t * apply = pos->q8 + Lsizes[k];
2543                 const abelt * inp = from + k * di_sub;
2544                 matmul_sub_large_asb_tr(x, inp, scratch, apply, pos->ql);
2545 
2546                 matmul_sub_large_fbd_tr(x, sbucket, bucket[k], fill, Lsizes[k]);
2547 
2548                 pos->q8 += 2 * Lsizes[k];
2549                 pos->ql += LSL_NBUCKETS_MAX;
2550             }
2551             swap(pos->q8, q8_saved);
2552             ASSERT_ALWAYS((((unsigned long)pos->q8)&1)==0);
2553             matmul_sub_large_fbi_tr(x, bucket, outp, pos->q8, n);
2554             pos->q8 += 2 * n;
2555             swap(pos->q8, q8_saved);
2556             j = j1;
2557         }
2558     }
2559     ASM_COMMENT("end of huge (very sparse) slices"); /* }}} */
2560 }
2561 
2562 #ifndef MATMUL_SUB_VSC_DISPATCH_H
2563 static inline void matmul_sub_vsc_dispatch(abdst_field x, abelt * dst, abelt const * src, const uint16_t * q, unsigned int count)
2564 {
2565     // printf("dispatch(%u), sum=%x\n", count, idiotic_sum((void*)q, count * sizeof(uint16_t)));
2566     for( ; count-- ; ) {
2567         abset(x, dst[0], src[*q++]);
2568         dst += 1;
2569     }
2570 }
2571 #endif
2572 
2573 #ifndef MATMUL_SUB_VSC_COMBINE_H_
2574 static inline void matmul_sub_vsc_combine(abdst_field x, abelt * dst, const abelt * * mptrs, const uint8_t * q, unsigned int count, unsigned int defer MAYBE_UNUSED)
2575 {
2576     // printf("combine(%u), defer %u\n", count, defer);
2577     // printf("combine(%u), sum=%x\n", count, idiotic_sum((void*)q, compressed_size(count, defer)));
2578     if (0) {
2579 #ifdef COMPRESS_COMBINERS_1
2580     } else if (defer == 1) {
2581         const unsigned int nbits = 1;
2582         for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2583             uint8_t wx = *q++;
2584             for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2585                 uint8_t c = wx & ((1 << nbits) - 1);
2586                 ASSERT(c <= defer);
2587                 wx >>= nbits;
2588                 abadd(x, dst[0], dst[0], mptrs[c][0]);
2589                 mptrs[c] += c != 0;
2590                 dst += c == 0;
2591             }
2592         }
2593 #endif
2594 #ifdef COMPRESS_COMBINERS_2
2595     } else if (defer <= 3) {
2596         const unsigned int nbits = 2;
2597         for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2598             uint8_t wx = *q++;
2599             for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2600                 uint8_t c = wx & ((1 << nbits) - 1);
2601                 ASSERT(c <= defer);
2602                 wx >>= nbits;
2603                 abadd(x, dst[0], dst[0], mptrs[c][0]);
2604                 mptrs[c] += c != 0;
2605                 dst += c == 0;
2606             }
2607         }
2608 #endif
2609 #ifdef COMPRESS_COMBINERS_4
2610     } else if (defer <= 15) {
2611         const unsigned int nbits = 4;
2612         for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2613             uint8_t wx = *q++;
2614             for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2615                 uint8_t c = wx & ((1 << nbits) - 1);
2616                 ASSERT(c <= defer);
2617                 wx >>= nbits;
2618                 abadd(x, dst[0], dst[0], mptrs[c][0]);
2619                 mptrs[c] += c != 0;
2620                 dst += c == 0;
2621             }
2622         }
2623 #endif
2624     } else {
2625         for( ; count-- ; ) {
2626             uint8_t c = *q++;
2627                 ASSERT(c <= defer);
2628             abadd(x, dst[0], dst[0], mptrs[c][0]);
2629             mptrs[c] += c != 0;
2630             dst += c == 0;
2631         }
2632     }
2633 }
2634 #endif
2635 
2636 #ifndef MATMUL_SUB_VSC_COMBINE_TR_H_
2637 static inline void matmul_sub_vsc_combine_tr(abdst_field x, abelt ** mptrs, const abelt * qw, const uint8_t * z, unsigned int count, unsigned int defer MAYBE_UNUSED)
2638 {
2639     // printf("uncombine(%u), defer %u\n", count, defer);
2640     // printf("uncombine(%u), sum=%x\n", count, idiotic_sum((void*)z, compressed_size(count, defer)));
2641     if (0) {
2642 #ifdef COMPRESS_COMBINERS_1
2643     } else if (defer == 1) {
2644         const unsigned int nbits = 1;
2645         for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2646             uint8_t wx = *z++;
2647             for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2648                 uint8_t c = wx & ((1 << nbits) - 1);
2649                 ASSERT(c <= defer);
2650                 wx >>= nbits;
2651                 abset(x, mptrs[c][0], qw[0]);
2652                 mptrs[c] += c != 0;
2653                 qw += c == 0;
2654             }
2655         }
2656 #endif
2657 #ifdef COMPRESS_COMBINERS_2
2658     } else if (defer <= 3) {
2659         const unsigned int nbits = 2;
2660         for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2661             uint8_t wx = *z++;
2662             for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2663                 uint8_t c = wx & ((1 << nbits) - 1);
2664                 ASSERT(c <= defer);
2665                 wx >>= nbits;
2666                 abset(x, mptrs[c][0], qw[0]);
2667                 mptrs[c] += c != 0;
2668                 qw += c == 0;
2669             }
2670         }
2671 #endif
2672 #ifdef COMPRESS_COMBINERS_4
2673     } else if (defer <= 15) {
2674         const unsigned int nbits = 4;
2675         for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2676             uint8_t wx = *z++;
2677             for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2678                 uint8_t c = wx & ((1 << nbits) - 1);
2679                 ASSERT(c <= defer);
2680                 wx >>= nbits;
2681                 abset(x, mptrs[c][0], qw[0]);
2682                 mptrs[c] += c != 0;
2683                 qw += c == 0;
2684             }
2685         }
2686 #endif
2687     } else {
2688         for( ; count-- ; ) {
2689             uint8_t c = *z++;
2690             ASSERT(c <= defer);
2691             abset(x, mptrs[c][0], qw[0]);
2692             mptrs[c] += c != 0;
2693             qw += c == 0;
2694         }
2695     }
2696 }
2697 #endif
2698 
2699 #ifndef MATMUL_SUB_VSC_DISPATCH_TR_H_
2700 static inline void matmul_sub_vsc_dispatch_tr(abdst_field x, abelt * qr, const abelt * q, const uint16_t * z, unsigned int count)
2701 {
2702     // printf("undispatch(%u), sum=%x\n", count, idiotic_sum((void*)z, count * sizeof(uint16_t)));
2703     for( ; count-- ; ) {
2704         abadd(x, qr[*z], qr[*z], q[0]);
2705         z++;
2706         q += 1;
2707     }
2708 }
2709 #endif
2710 
2711 /* In preparation for the different steps, including the matrix
2712  * multiplication itself, we rebuild the vsc_slice_t object, or at
2713  * least its skeleton ; this makes the control loops easy to run. The
2714  * real data payload is of course not copied again ! */
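/* For reference, the skeleton data consumed below comes from the
 * pos->ql (aux) stream written by builder_push_vsc_slices(): first
 * nvstrips, then the number of steps, then for each step its defer
 * value and its tbuf_space ; the slice headers themselves are read back
 * from mm->headers in the very order in which they were pushed. */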
2715 
2716 static inline void rebuild_vsc_slice_skeleton(
2717         struct matmul_bucket_data_s * mm,
2718         vector<slice_header_t>::iterator & hdr,
2719         vsc_slice_t * V, struct pos_desc * pos,
2720         unsigned int & nvstrips,
2721         unsigned int & Midx,
2722         unsigned int & Didx,
2723         unsigned int & Ridx,
2724         unsigned int & Cidx)
2725 {
2726     *V->hdr = *hdr++;
2727     nvstrips = *pos->ql++;
2728     V->dispatch.resize(nvstrips);
2729     V->steps.resize(*pos->ql++);
2730     for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2731         V->steps[l].defer = *pos->ql++;
2732         V->steps[l].tbuf_space = *pos->ql++;
2733     }
2734 
2735     Midx = hdr - mm->headers.begin();
2736     for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
2737         vsc_middle_slice_t &D (V->dispatch[k]);
2738         *D.hdr = *hdr++;
2739     }
2740     Didx = hdr - mm->headers.begin();
2741     for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
2742         vsc_middle_slice_t &D (V->dispatch[k]);
2743         D.sub.resize(V->steps.size());
2744         for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2745             vsc_sub_slice_t & S(D.sub[l]);
2746             *S.hdr = *hdr++;
2747             V->steps[l].nrows = S.hdr->i1 - S.hdr->i0;
2748         }
2749     }
2750     /* Skip over the combining headers as well */
2751     Ridx = hdr - mm->headers.begin();
2752     for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2753         hdr++;
2754     }
2755     Cidx = hdr - mm->headers.begin();
2756     for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
2757         for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2758             if (!flush_here(k, V->dispatch.size(), V->steps[l].defer))
2759                 continue;
2760             hdr++;
2761         }
2762     }
2763 }
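
/* Reading aid (inferred from the loops above, not a format spec): one
 * skeleton rebuild consumes, in this order,
 *   - one envelope header (copied into V->hdr),
 *   - from the aux stream: nvstrips, the number of steps, then a
 *     (defer, tbuf_space) pair per step,
 *   - one header per vertical strip (Midx points at the first),
 *   - one header per (strip, step) sub-slice (Didx),
 *   - one header per step (Ridx), merely skipped over,
 *   - one header per (strip, step) pair for which flush_here() is true
 *     (Cidx).
 * The following hypothetical helper, disabled and for illustration
 * only, mirrors those loops to compute the total number of headers a
 * rebuilt skeleton accounts for. */
#if 0
static inline size_t vsc_skeleton_header_count(vsc_slice_t const * V)
{
    size_t n = 1 + V->dispatch.size();              /* envelope + strips */
    n += V->dispatch.size() * V->steps.size();      /* sub-slices */
    n += V->steps.size();                           /* row blocks */
    for(unsigned int k = 0 ; k < V->dispatch.size() ; k++)
        for(unsigned int l = 0 ; l < V->steps.size() ; l++)
            n += flush_here(k, V->dispatch.size(), V->steps[l].defer) != 0;
    return n;
}
#endif
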
static inline void matmul_bucket_mul_vsc(struct matmul_bucket_data_s * mm, vector<slice_header_t>::iterator & hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
{
    abdst_field x = mm->xab;

    int usual = d == ! mm->public_->store_transposed;

    abelt * where      = dst + (usual ? hdr->i0 : hdr->j0);
    // abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
    if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
        abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
    }

    abelt * scratch = mm->scratch3;

    /* {{{ */

    vsc_slice_t V[1];
    unsigned int nvstrips;
    unsigned int Midx;
    unsigned int Didx;
    unsigned int Ridx;
    unsigned int Cidx;
    rebuild_vsc_slice_skeleton(mm, hdr, V, pos, nvstrips, Midx, Didx, Ridx, Cidx);

    /*}}}*/

    ASM_COMMENT("multiplication code -- vertical staircase");/*{{{*/

    /* now prepare pointers */
    vector<abelt *> base_ptrs;
    vector<abelt *> cptrs;
    abelt * q0 = scratch;
    abelt * dummy = q0;
    abvec_set_zero(x, dummy, 1);
    q0++;
    for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
        base_ptrs.push_back(q0);
        cptrs.push_back(q0);
        q0 += V->steps[l].tbuf_space;
    }
    base_ptrs.push_back(q0);
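    /* At this point base_ptrs[l] .. base_ptrs[l+1] delimit the scratch
     * area reserved for staircase step l, and cptrs[l] is the current
     * fill pointer within it (the ASSERTs below check this invariant).
     * The leading cell pointed to by dummy backs the zero combiner
     * index; it is zeroed here and never carries useful data. */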

    if (usual) {
        uint32_t skipover = *pos->ql++;
        pos->ql += skipover;
        for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
            vsc_middle_slice_t const & D(V->dispatch[k]);
            const abelt * qr = src + D.hdr->j0;
            mm->slice_timings[Midx].t -= wct_seconds();
            for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
                vsc_sub_slice_t const & S(D.sub[l]);
                abelt * q = cptrs[l];
                unsigned int count = S.hdr->ncoeffs;
                ASSERT(base_ptrs[l] <= q);
                ASSERT(q <= base_ptrs[l+1]);
                mm->slice_timings[Didx].t -= wct_seconds();
                matmul_sub_vsc_dispatch(x, q, qr, pos->q16, count);
                q += count;
                pos->q16 += count;
                mm->slice_timings[Didx].t += wct_seconds();
                Didx++;
                ASSERT(q <= base_ptrs[l+1]);
                cptrs[l] = q;
            }
            mm->slice_timings[Midx].t += wct_seconds();
            Midx++;

            for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
                vsc_sub_slice_t const & S(D.sub[l]);
                unsigned int defer = V->steps[l].defer;
                if (!flush_here(k,nvstrips,defer))
                    continue;

                /* our different read pointers */
                vector<abelt const *> mptrs;
                mptrs.reserve(defer + 1);
                abelt const * q = base_ptrs[l];
                mptrs.push_back(dummy);
                for(unsigned int k0 = k - k % defer ; k0 <= k ; k0++) {
                    mptrs.push_back(q);
                    q += V->dispatch[k0].sub[l].hdr->ncoeffs;
                }

                abelt * qw = dst + S.hdr->i0;
                unsigned int count = mm->headers[Cidx].ncoeffs + V->steps[l].nrows;
                ASSERT(V->steps[l].nrows == S.hdr->i1 - S.hdr->i0);
                ASSERT(q - base_ptrs[l] == (ptrdiff_t) mm->headers[Cidx].ncoeffs);

                double t = wct_seconds();
                mm->slice_timings[Cidx].t -= t;
                mm->slice_timings[Ridx+l].t -= t;
                matmul_sub_vsc_combine(x, qw, ptrbegin(mptrs), pos->q8, count, defer);
                pos->q8 += compressed_size(count, defer);
                t = wct_seconds();
                mm->slice_timings[Cidx].t += t;
                mm->slice_timings[Ridx+l].t += t;
                Cidx++;
                cptrs[l]=base_ptrs[l];
            }
        }
        ASSERT((ptrdiff_t) Cidx == hdr - mm->headers.begin());
    } else {
        /* There's quite an annoying difficulty here. The combining
         * buffers are not read in the same order, because ``combining''
         * (which, for the transposed product, actually amounts to
         * dispatching) of course occurs _earlier_ in the operation.
         */
        pos->ql++;      // only the count, for fast skipover.
        for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
            vsc_middle_slice_t const & D(V->dispatch[k]);
            abelt * qr = dst + D.hdr->j0;
            for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
                vsc_sub_slice_t const & S(D.sub[l]);
                unsigned int defer = V->steps[l].defer;

                /* Here the test is not the same. We have to fill the
                 * scratch buffers ahead of time. */
                if (k % defer)
                    continue;

                /* our different _write_ pointers */
                vector<abelt *> mptrs;
                mptrs.reserve(defer + 1);
                abelt * q = base_ptrs[l];
                cptrs[l] = q;
                mptrs.push_back(dummy);
                for(unsigned int k0 = k ; k0 <= when_flush(k,nvstrips,defer) ; k0++) {
                    mptrs.push_back(q);
                    q += V->dispatch[k0].sub[l].hdr->ncoeffs;
                }

                const abelt * qw = src + S.hdr->i0;
                // unsigned int count = mm->headers[Cidx].ncoeffs + V->steps[l].nrows;
                const uint8_t * z = pos->q8 + *pos->ql++;
                unsigned int count = *pos->ql++;

                double t = wct_seconds();
                // mm->slice_timings[Cidx].t -= t;
                mm->slice_timings[Ridx+l].t -= t;
                matmul_sub_vsc_combine_tr(x, ptrbegin(mptrs), qw, z, count, defer);
                z += compressed_size(count, defer);
                t = wct_seconds();
                // mm->slice_timings[Cidx].t += t;
                mm->slice_timings[Ridx+l].t += t;
                // Cidx++;
            }

            mm->slice_timings[Midx].t -= wct_seconds();
            for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
                vsc_sub_slice_t const & S(D.sub[l]);
                abelt * q = cptrs[l];
                unsigned int count = S.hdr->ncoeffs;
                ASSERT(base_ptrs[l] <= q);
                ASSERT(q <= base_ptrs[l+1]);
                mm->slice_timings[Didx].t -= wct_seconds();
                matmul_sub_vsc_dispatch_tr(x, qr, q, pos->q16, count);
                cptrs[l] += count;
                pos->q16 += count;
                mm->slice_timings[Didx].t += wct_seconds();
                Didx++;
                ASSERT(q <= base_ptrs[l+1]);
            }
            mm->slice_timings[Midx].t += wct_seconds();
            Midx++;
        }
        // ASSERT(Cidx == hdr - mm->headers.begin());
        pos->q8 += *pos->ql++;
    }

    ASM_COMMENT("end of vertical staircase");/*}}}*/

    /* This is an envelope type, so make sure the iterator ends up
     * properly positioned when we return */
    hdr--;
}

///////////////////////////////////////////////////////////////////////
// just count how many times an iteration schedules a coefficient in
// the fbi/fbd/asb routines.

static inline void mm_finish_init(struct matmul_bucket_data_s * mm)
{
    struct pos_desc pos[1];

    pos->q16 = ptrbegin(mm->t16);
    pos->q8 = ptrbegin(mm->t8);
    pos->ql = ptrbegin(mm->aux);
    pos->i = 0;
    pos->nrows_t = mm->public_->dim[ mm->public_->store_transposed];
    pos->ncols_t = mm->public_->dim[!mm->public_->store_transposed];

    /*
    for(uint16_t h = 0 ; h < mm->headers.size() ; h++) {
        slice_header_t * hdr = & (mm->headers[h]);
        printf("block %d %s [%d..%d[ x [%d..%d[, %d cld, %" PRIu64 " coeffs\n",
                h, slice_name(hdr->t),
                hdr->i0, hdr->i1,
                hdr->j0, hdr->j1,
                hdr->nchildren, hdr->ncoeffs);
    }
    */
    for(vector<slice_header_t>::iterator hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
        switch(hdr->t) {
            case SLICE_TYPE_SMALL1:
                break;
            case SLICE_TYPE_SMALL1_VBLOCK:
                // mm->mms1_ncoeffs += hdr->ncoeffs;
                pos->q16 += 2 * hdr->ncoeffs;
                break;
            case SLICE_TYPE_SMALL2:
                pos->q16 += hdr->ncoeffs;
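                /* round the q16 consumption up to an even number of
                 * 16-bit entries: (2 - 1) & -ncoeffs adds 1 iff ncoeffs
                 * is odd */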
                pos->q16 += (2 - 1) & - hdr->ncoeffs;
                // mm->mms2_ncoeffs += hdr->ncoeffs;
                break;
            case SLICE_TYPE_LARGE_ENVELOPE:
                {
                    uint32_t j = 0;
                    for( ; j < pos->ncols_t ; ) {
                        uint32_t j1 = j + *pos->ql++;
                        uint32_t n = *pos->ql++;
                        // mm->fbi_ncoeffs += n;
                        // mm->asb_ncoeffs += n;
                        pos->q8 += 3*n;
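                        /* pad the q8 consumption to an even byte count
                         * (n & 1 adds one byte iff n is odd) */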
                        pos->q8 += n & 1;
                        pos->ql += LSL_NBUCKETS_MAX;
                        j = j1;
                    }
                }
                break;
            case SLICE_TYPE_HUGE_ENVELOPE:
                {
                    uint32_t j = 0;
                    unsigned int nlarge = *pos->ql++;
                    for( ; j < pos->ncols_t ; ) {
                        uint32_t j1 = j + *pos->ql++;
                        unsigned int n = *pos->ql++;
                        // mm->fbi_ncoeffs += n;
                        // mm->fbd_ncoeffs += n;
                        // mm->asb_ncoeffs += n;
                        pos->ql += nlarge;
                        pos->ql += nlarge * LSL_NBUCKETS_MAX;
                        pos->q8 += 4*n;
                        j = j1;
                    }
                }
                break;
            case SLICE_TYPE_DEFER_ENVELOPE:
                {
                    vsc_slice_t V[1];
                    unsigned int nvstrips;
                    unsigned int Midx;
                    unsigned int Didx;
                    unsigned int Ridx;
                    unsigned int Cidx;
                    rebuild_vsc_slice_skeleton(mm, hdr, V, pos, nvstrips, Midx, Didx, Ridx, Cidx);
                    hdr--;
                    /*
                    for(unsigned int s = 0 ; s < V->steps.size() ; s++) {
                        unsigned int defer = V->steps[s].defer;
                        printf("Rows %" PRIu32 "+%" PRIu32, mm->headers[Ridx++].i0, V->steps[s].nrows);
                        if (V->steps[s].density_upper_bound != UINT_MAX) {
                            printf(": d < %u", V->steps[s].density_upper_bound);
                        }
                        printf("; %u flushes (every %u), tbuf space %lu\n", iceildiv(nvstrips, defer), defer, V->steps[s].tbuf_space);
                    }
                    */
                }
                break;
            default:
                fprintf(stderr, "Unexpected block %s encountered\n",
                        slice_name(hdr->t));
                break;
        }
        if (hdr->j1 == pos->ncols_t) {
            pos->i = hdr->i1;
        }
    }

#if 0
    for(uint16_t h = 0 ; h < mm->headers.size() ; h++) {
        slice_header_t * hdr = & (mm->headers[h]);
        ASSERT_ALWAYS(pos->i == hdr->i0);
        switch(hdr->t) {
            case SLICE_TYPE_SMALL1_VBLOCK:
                mm->mms1_ncoeffs += hdr->ncoeffs;
                pos->q16 += 2 * hdr->ncoeffs;
                break;
            case SLICE_TYPE_SMALL2:
                pos->q16 += hdr->ncoeffs;
                pos->q16 += (2 - 1) & - hdr->ncoeffs;
                mm->mms2_ncoeffs += hdr->ncoeffs;
                break;
            case SLICE_TYPE_LARGE_ENVELOPE:
                {
                    uint32_t j = 0;
                    for( ; j < pos->ncols_t ; ) {
                        uint32_t j1 = j + *pos->ql++;
                        uint32_t n = *pos->ql++;
                        mm->fbi_ncoeffs += n;
                        mm->asb_ncoeffs += n;
                        // pos->q8 += 3*n;
                        pos->ql += LSL_NBUCKETS_MAX;
                        j = j1;
                    }
                }
                break;
            case SLICE_TYPE_HUGE_ENVELOPE:
                {
                    uint32_t j = 0;
                    unsigned int nlarge = *pos->ql++;
                    for( ; j < pos->ncols_t ; ) {
                        uint32_t j1 = j + *pos->ql++;
                        unsigned int n = *pos->ql++;
                        mm->fbi_ncoeffs += n;
                        mm->fbd_ncoeffs += n;
                        mm->asb_ncoeffs += n;
                        pos->ql += nlarge;
                        pos->ql += nlarge * LSL_NBUCKETS_MAX;
                        // pos->q8 += 4*n;
                        j = j1;
                    }
                }
                break;
            case SLICE_TYPE_DEFER_ENVELOPE:
            default:
                break;
        }
        if (hdr->j1 == pos->ncols_t) {
            pos->i = hdr->i1;
        }
    }
#endif

    abvec_init(mm->xab, &mm->scratch1, mm->scratch1size);
    abvec_init(mm->xab, &mm->scratch2, mm->scratch2size);
    abvec_init(mm->xab, &mm->scratch3, mm->scratch3size);

    mm->slice_timings.resize(mm->headers.size());
}

static void matmul_bucket_zero_stats(struct matmul_bucket_data_s * mm)
{
    vector<slice_header_t>::iterator hdr;
    for(hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
        unsigned int hidx = hdr - mm->headers.begin();
        mm->slice_timings[hidx].t = 0;
    }
}

static inline void matmul_bucket_mul_small1(struct matmul_bucket_data_s * mm, vector<slice_header_t>::iterator & hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
{
    slice_header_t & h(*hdr++);
    for(unsigned int i = 0 ; i < h.nchildren ; i++, hdr++) {
        ASSERT(hdr != mm->headers.end());
        ASSERT(hdr->t == SLICE_TYPE_SMALL1_VBLOCK);
        mm->slice_timings[hdr - mm->headers.begin()].t -= wct_seconds();
        matmul_bucket_mul_small1_vblock(mm, &*hdr, dst, src, d, pos);
        mm->slice_timings[hdr - mm->headers.begin()].t += wct_seconds();
    }
    hdr--;
}

static inline void matmul_bucket_mul_loop(struct matmul_bucket_data_s * mm, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
{
    vector<slice_header_t>::iterator hdr;

    for(hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
        unsigned int hidx = hdr - mm->headers.begin();
        mm->slice_timings[hidx].t -= wct_seconds();
        switch(hdr->t) {
            case SLICE_TYPE_SMALL2:
                matmul_bucket_mul_small2(mm, &*hdr, dst, src, d, pos);
                break;
            case SLICE_TYPE_SMALL1:
                matmul_bucket_mul_small1(mm, hdr, dst, src, d, pos);
                break;
            case SLICE_TYPE_LARGE_ENVELOPE:
                matmul_bucket_mul_large(mm, &*hdr, dst, src, d, pos);
                break;
            case SLICE_TYPE_HUGE_ENVELOPE:
                matmul_bucket_mul_huge(mm, &*hdr, dst, src, d, pos);
                break;
            case SLICE_TYPE_DEFER_ENVELOPE:
                matmul_bucket_mul_vsc(mm, hdr, dst, src, d, pos);
                break;
            /* some slice types are ``contained'', and we should never
             * see them. NOTE that this implies in particular that the
             * timing for type "defer" (DEFER_ENVELOPE) is counted here,
             * thus including all overhead, while the timings for the
             * inner objects are counted from within the subroutines. */
            case SLICE_TYPE_SMALL1_VBLOCK:
            case SLICE_TYPE_DEFER_COLUMN:
            case SLICE_TYPE_DEFER_ROW:
            case SLICE_TYPE_DEFER_CMB:
            case SLICE_TYPE_DEFER_DIS:
                break;
            /* The current implementation no longer accepts data not
             * obeying the header structure, so the default branch also
             * aborts. */
            default:
                fprintf(stderr, "Bogus slice type seen: %d\n", hdr->t);
                ASSERT_ALWAYS(0);
        }
        if (hdr->j1 == pos->ncols_t) { pos->i = hdr->i1; }
        mm->slice_timings[hidx].t += wct_seconds();
    }
}

void MATMUL_NAME(mul)(matmul_ptr mm0, void * xdst, void const * xsrc, int d)
{
    struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
    struct pos_desc pos[1];

    abdst_vec dst = (abdst_vec) xdst;
    absrc_vec src = (absrc_vec) const_cast<void*>(xsrc);
    pos->q16 = ptrbegin(mm->t16);
    pos->q8 = ptrbegin(mm->t8);
    pos->ql = ptrbegin(mm->aux);
    pos->i = 0;
    pos->nrows_t = mm->public_->dim[ mm->public_->store_transposed];
    pos->ncols_t = mm->public_->dim[!mm->public_->store_transposed];

    abdst_field x = mm->xab;

    mm->main_timing.t -= wct_seconds();

    if (d == !mm->public_->store_transposed) {
        /* This is the ``normal'' case (matrix times vector). */
    } else {
        /* d == mm->public_->store_transposed */
        /* BEWARE, it's a priori sub-optimal ! In practice, the
         * difference isn't so striking though. */
        if (mm->public_->iteration[d] == 10) {
            fprintf(stderr, "Warning: Doing many iterations with bad code\n");
        }
        /* We zero out the dst area beforehand */
        abvec_set_zero(x, dst, pos->ncols_t);
    }
    matmul_bucket_mul_loop(mm, dst, src, d, pos);

    mm->main_timing.t += wct_seconds();

    mm->public_->iteration[d]++;
}

static std::ostream& matmul_bucket_report_vsc(std::ostream& os, struct matmul_bucket_data_s * mm, double scale, vector<slice_header_t>::iterator & hdr, double * p_t_total)
{
    uint64_t scale0;
    scale0 = (mm->public_->iteration[0] + mm->public_->iteration[1]);
    unsigned int nvstrips = hdr->nchildren;
    hdr++;
    unsigned int nsteps = hdr->nchildren;
    vector<pair<uint64_t, double> > dtime(nsteps);
    vector<pair<uint64_t, double> > ctime(nsteps);
    for(unsigned int k = 0 ; k < nvstrips ; k++) {
        ASSERT_ALWAYS(hdr->t == SLICE_TYPE_DEFER_COLUMN);
        hdr++;
    }
    // hdr+=nvstrips;
    for(unsigned int k = 0 ; k < nvstrips ; k++) {
        for(unsigned int l = 0 ; l < nsteps ; l++) {
            ASSERT_ALWAYS(hdr->t == SLICE_TYPE_DEFER_DIS);
            double t = mm->slice_timings[hdr - mm->headers.begin()].t;
            uint64_t nc = hdr->ncoeffs;
            dtime[l].first += nc;
            dtime[l].second += t;
            hdr++;
        }
    }
    double total_from_defer_rows = 0;
    double total_from_defer_cmbs = 0;
    for(unsigned int l = 0 ; l < nsteps ; l++) {
        ASSERT_ALWAYS(hdr->t == SLICE_TYPE_DEFER_ROW);
        double t = mm->slice_timings[hdr - mm->headers.begin()].t;
        uint64_t nc = hdr->ncoeffs;
        ctime[l].first += nc;
        ctime[l].second += t;
        total_from_defer_rows+=t;
        hdr++;
    }
    /* Skip the combining blocks, because they're accounted for already
     * by the row blocks */
    for( ; hdr != mm->headers.end() && hdr->t == SLICE_TYPE_DEFER_CMB ; hdr++) {
        double t = mm->slice_timings[hdr - mm->headers.begin()].t;
        total_from_defer_cmbs+=t;
    }
    /* Some jitter will appear if transposed mults are performed, because
     * for the moment transposed mults don't properly store timing info
     */
    /*
    printf("jitter %.2f - %.2f = %.2f\n",
            total_from_defer_rows / scale0, total_from_defer_cmbs / scale0,
            (total_from_defer_rows - total_from_defer_cmbs) / scale0);
            */
    for(unsigned int l = 0 ; l < nsteps ; l++) {
        double t;
        uint64_t nc;
        double a;
        nc = dtime[l].first;
        t = dtime[l].second / scale0;
        a = 1.0e9 * t / nc;
        *p_t_total += t;
        os << fmt::sprintf("defer\t%.2fs         ; n=%-9" PRIu64 " ; %5.2f ns/c ;"
            " scaled*%.2f : %5.2f/c\n",
            t, nc, a, scale, a * scale);
        nc = ctime[l].first;
        t = ctime[l].second / scale0;
        a = 1.0e9 * t / nc;
        *p_t_total += t;
        os << fmt::sprintf("      + %.2fs [%.2fs] ; n=%-9" PRIu64 " ; %5.2f ns/c ;"
            " scaled*%.2f : %5.2f/c\n",
            t, *p_t_total, nc, a, scale, a * scale);
    }
    hdr--;
    return os;
}


void MATMUL_NAME(report)(matmul_ptr mm0, double scale)
{
    struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
    uint64_t scale0;
    scale0 = (mm->public_->iteration[0] + mm->public_->iteration[1]);

    std::ostringstream os;

    vector<slice_header_t>::iterator hdr;

    os << fmt::sprintf("n %" PRIu64 " %.3fs/iter (wct of cpu-bound loop)\n",
            mm->public_->ncoeffs,
            mm->main_timing.t / scale0
            );

    double t_total = 0;
    for(hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
        if (hdr->t == SLICE_TYPE_SMALL1_VBLOCK) continue;
        if (hdr->t == SLICE_TYPE_DEFER_ENVELOPE) {
            matmul_bucket_report_vsc(os, mm, scale, hdr, &t_total);
            continue;
        }
        double t = mm->slice_timings[hdr - mm->headers.begin()].t;
        uint64_t nc = hdr->ncoeffs;
        t /= scale0;
        t_total += t;
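        /* a is the average per-iteration cost in nanoseconds per
         * coefficient (t has already been divided by the total
         * iteration count scale0) */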
        double a = 1.0e9 * t / nc;
        os << fmt::sprintf("%s\t%.2fs [%.2fs] ; n=%-9" PRIu64 " ; %5.2f ns/c ;"
            " scaled*%.2f : %5.2f/c\n",
            slice_name(hdr->t), t, t_total,
            nc, a, scale, a * scale);
    }
    for(int i = 0 ; i < 40 ; i++) os << '-';
    os << '\n';
    for(int i = 0 ; i < SLICE_TYPE_MAX ; i++) {
        if (i == SLICE_TYPE_SMALL1_VBLOCK) continue;
        double t = 0;
        uint64_t nc = 0;
        for(hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
            if (hdr->t != i) continue;
            nc += hdr->ncoeffs;
            t += mm->slice_timings[hdr - mm->headers.begin()].t;
        }
        if (nc == 0) continue;
        t /= scale0;
        double a = 1.0e9 * t / nc;
        os << fmt::sprintf("%s\t%.2fs ; n=%-9" PRIu64 " ; %5.2f ns/c ;"
            " scaled*%.2f : %5.2f/c\n",
            slice_name(i), t,
            nc, a, scale, a * scale);
    }
    if (mm->public_->report_string)
        free(mm->public_->report_string);
    mm->public_->report_string_size = os.str().size() + 1;
    mm->public_->report_string = strdup(os.str().c_str());
}

void MATMUL_NAME(auxv)(matmul_ptr mm0, int op MAYBE_UNUSED, va_list ap MAYBE_UNUSED)
{
    struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
    if (op == MATMUL_AUX_ZERO_STATS) {
        matmul_bucket_zero_stats(mm);
        mm->public_->iteration[0] = 0;
        mm->public_->iteration[1] = 0;
    }
#if 0
    if (op == MATMUL_AUX_GET_READAHEAD) {
        unsigned int * res = va_arg(ap, unsigned int *);
        ++*res;
    }
#endif
}


void MATMUL_NAME(aux)(matmul_ptr mm0, int op, ...)
{
    va_list ap;
    va_start(ap, op);
    MATMUL_NAME(auxv) (mm0, op, ap);
    va_end(ap);
}
/* vim: set sw=4: */