1 /* Manage the in-memory data for the matrix */
2
/* It's in C++ because the STL is handy, but that's really all there is
 * to it; a conversion to C would not be extremely difficult */
5
6 #include "cado.h" // IWYU pragma: keep
7 // IWYU pragma: no_include <memory>
8 #include <cinttypes>
9 #include <climits>
10 #include <cmath>
11 #include <cstdarg> // for va_list, va_end, va_start
12 #include <cstddef> // size_t, NULL
13 #include <cstdint>
14 #include <cstdio>
15 #include <cstdlib>
16 #include <cstring>
17 // C++ headers.
18 #include <algorithm> // sort
19 #include <deque>
20 #include <limits> // underlying_type_{min,max}
21 #include <list>
22 #include <sstream> // ostringstream // IWYU pragma: keep
23 #include <string> // for basic_string
24 #include <type_traits> // for __strip_reference_wrapper<>::__type
25 #include <utility> // for pair, make_pair, swap
26 #include <vector>
27
28 #include "fmt/core.h" // for check_format_string
29 #include "fmt/format.h" // for basic_buffer::append, basic_parse_context...
30 #include "fmt/printf.h" // fmt::fprintf // IWYU pragma: keep
31 #include "matmul.h" // for matmul_ptr, matmul_public_s, MATMUL_AUX_Z...
32 #include "macros.h"
33 #include "verbose.h" // CADO_VERBOSE_PRINT_BWC_CACHE_BUILD
34 #include "timing.h" // wct_seconds
35 #include "mpfq_layer.h"
36
37 using namespace std;
38
39 /* Make sure that the assembly function is only called if it matches
40 * correctly the abase header !! */
41 #if defined(HAVE_GAS_SYNTAX_ASSEMBLY_SOURCES) && defined(SELECT_MPFQ_LAYER_u64k1)
// Disabling one particular assembly routine is done by simply not
// including its header file (optionally, the assembly file can also be
// removed from the link list in the CMakeLists.txt file, but in reality
// that is not needed: this file being C++, the names are mangled unless
// a header file marks them as having C linkage, so commenting out the
// #include is enough).
47 #include "matmul-sub-small1.h" // IWYU pragma: keep
48 #include "matmul-sub-small2.h" // IWYU pragma: keep
49 #include "matmul-sub-large-fbd.h" // IWYU pragma: keep
50 #include "matmul-sub-large-fbi.h" // IWYU pragma: keep
51 #include "matmul-sub-vsc-dispatch.h" // IWYU pragma: keep
52 #include "matmul-sub-vsc-combine.h" // IWYU pragma: keep
53 // #define ENABLE_ASM
54 #endif
55
56 #include "matmul-common.h"
57
58 #include "matmul_facade.h"
59 #include "portability.h" // strdup // IWYU pragma: keep
60 #include "params.h"
61
62 /* {{{ Documentation
63 *
64 * Parameters referred to by this documentation text may be tuned via
65 * #define's below.
66 *
67 * This implementation builds upon the ``sliced'' variant.
68 * The format used with the -sliced implementation is still used, but
69 * there is also another format used for very dense strips. We refer to
70 * matmul-sliced.cpp for the first format, which is called ``small1''
71 * here (and there also).
72 *
 * The ``small2'' format is specialised for denser strips. Within a
 * horizontal strip of less than 4096 rows, when two non-zero columns are
75 * never more than 16 positions apart, then the information (dj,i) is
76 * stored in 16 bits instead of 32. This reduces the memory footprint
77 * quite considerably, since many coefficients are in dense strips.
78 * Speed-wise, small2 strips provide a nice speed-up sometimes (not
79 * always).
80 *
81 * When we reach sparser areas, we use another method. Dense (small2 and
82 * small1) slices having been discarded, the rest is split in ``large''
83 * slices of LSL_NBUCKETS_MAX*256 rows at most (in fact we arrange for the
84 * slices to have equal size, so it's a bit less). For each such large
85 * slice, we use scratch space equal to the L2 cache size for performing
86 * matrix multiplication. While reading column vector data, we copy
 * coefficients to (up to) LSL_NBUCKETS_MAX ``buckets'', each representing
 * 256 possible row indices. The dispatching information, telling into
 * which bucket a given coefficient has to be copied, is an 8-bit integer
90 * stored in a table called ``main''. Each entry in table ``main'' also
91 * has a second 8-bit integer indicating the offset to the next useful
92 * column entry (because we're considering very sparse blocks, it is not
93 * at all uncommon to see average values ~ 10 here, even when
94 * LSL_NBUCKETS_MAX==256, so that 65536 rows are considered
95 * simultaneously).
96 *
97 * The number of columns considered while filling the scratch space is
98 * limited by the scratch space size. Once it's filled, we ``apply the
99 * buckets'', so as to be able to reuse our scratch space buffer for
100 * another batch of columns. Thus a ``large slice'' is typically split
101 * into several ``vertical blocks''. Extremely sparse large slices will
102 * have only one vertical block.
103 *
104 * ``Applying the buckets'' means taking a list of column coefficients
105 * which have been directed to the bucket because it is known that they
106 * affect at least one of the 256 corresponding row indices on output. So
107 * in addition to the N coefficients stored in the bucket, we have a
 * list of N 8-bit integers indicating which row index is to be modified
 * (see the illustrative sketch right after this comment block).
109 *
110 * [the section below is superseded by the next one]
111 * That's it for the basic idea. Now on top of that, we use an extra
112 * indirection level for very sparse blocks. Several ``large blocks'' are
113 * considered together as part of a ``huge block''. Call this number
114 * ``nlarge''. Source coefficients are first dispatched in nlarge ``big
115 * buckets'' of appropriate size. The number of columns considered is
116 * guided by the fact that we want none of these big buckets to exceed
 * the L2 cache size. Afterwards, we do a second dispatching from each
 * of these big buckets (in turn) to (up to) LSL_NBUCKETS_MAX small
 * buckets in the same manner as above (except that we no longer need
 * the information on the link to the next useful column).
121 *
122 * [documentation of the new format]
123 * After ``large'' blocks, we consider ``deferred'' blocks, also referred
124 * to as (vertical) staircase-shaped blocks. The idea is the following.
125 * The portion of the matrix under study is split into several vertical
126 * blocks of at most 2^16 entries. In the other direction, we make splits
 * according to the average row density. A first batch of rows has some
128 * indicative density X, the next one X/2, and so on (the ratio 2 being
129 * in fact VSC_BLOCKS_FLUSHFREQ_RATIO) until the sparsest block at the
130 * bottom. Such horizontal blocks need not have the same size.
131 *
132 * Vertical strips are processed in order. Buffers corresponding to the
 * different sub-matrices created by the horizontal
134 * splitting are filled. We thus populate scratch space with coefficients
135 * from the source vector. After each such processing of a vertical
136 * strip, we consider whether we have to perform flushing w.r.t. the
137 * horizontal strips. The buffers corresponding to the densest horizontal
 * strip are flushed after each vstrip processing. Those which are half
 * as dense are flushed half as often. Thus the next vstrip occupies
140 * some more buffer space. This pattern goes until the sparsest block,
141 * which is probably flushed only once. The total amount of buffer space
142 * needed is proportional to the number of coefficients in an area having
143 * a staircase shape. Assume we have N vstrips of width W. Assume
144 * horizontal blocks have a number of rows R0, R1, R2, ... corresponding
145 * to densities X, X/2, X/4, ... (the ratio 2 here is in fact given by
146 * VSC_BLOCKS_FLUSHFREQ_RATIO). The formula
147 * for the tbuf space is:
148 * max weight of the N matrices of size R0*W in h. strip #0
149 * + max weight of the N/2 matrices of size R1*2W in h. strip #1
150 * + max weight of the N/4 matrices of size R2*4W in h. strip #2
151 * ...
152 * It should be noted that the numbers added up in this formula are
153 * expected to be of the same magnitude asymptotically. For real-life
154 * examples though, it's pretty common to observe somewhat degenerate
155 * cases.
156 *
157 * }}} */
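/* The following is a hedged sketch (illustrative names only, not the
 * code that actually runs: the real kernels live in the matmul-sub-*
 * files) of what ``applying'' one bucket amounts to, as described in
 * the documentation above. Here abelt is treated as a plain scalar
 * type, dst points to the destination vector, i0 is the first of the
 * 256 rows covered by the bucket, bucket[] holds the N stashed source
 * coefficients, and idx[] the N 8-bit row offsets. */
#if 0
static void apply_one_bucket_sketch(abelt * dst, uint32_t i0,
        abelt const * bucket, uint8_t const * idx, size_t N)
{
    for(size_t k = 0 ; k < N ; k++) {
        /* each stashed coefficient lands on row i0 + idx[k] */
        dst[i0 + idx[k]] += bucket[k];
    }
}
#endif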
158
159
160 /* {{{ Documentation for parameters */
161 // take only a portion of the L1 cache.
162 // #define L1_CACHE_SIZE 192000
163 // for 8-bytes abt values, this gives 3500 items.
164 // note that allowing more than 4096 items here (or anything
165 // SMALL_SLICES_I_BITS dictates) effectively disables small2 slices,
166 // which pack (i,dj) values in 16 bits.
167
168 // tunables for small2 format.
169 #define SMALL_SLICES_I_BITS 12
170 #define SMALL_SLICES_DJ_BITS (16 - SMALL_SLICES_I_BITS)
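/* A hedged illustration of the small2 packing described above: dj goes
 * in the high SMALL_SLICES_DJ_BITS bits and the in-strip row index i in
 * the low SMALL_SLICES_I_BITS bits, matching what
 * builder_push_small_slice() emits into t16. These helpers are not used
 * by the code below. */
#if 0
static inline uint16_t small2_pack(uint32_t dj, uint32_t i)
{
    ASSERT(dj < (1u << SMALL_SLICES_DJ_BITS));
    ASSERT(i < (1u << SMALL_SLICES_I_BITS));
    return (uint16_t) ((dj << SMALL_SLICES_I_BITS) | i);
}
static inline void small2_unpack(uint16_t e, uint32_t * dj, uint32_t * i)
{
    *dj = e >> SMALL_SLICES_I_BITS;
    *i = e & ((1u << SMALL_SLICES_I_BITS) - 1);
}
#endif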
171
172 // make sure that it's something each core can use (i.e. divide by two
173 // the L2 cache size for a dual-core w/ shared L2).
174 // #define L2_CACHE_SIZE 1600000
175
176 /* To what should we compare the average dj value for determining when we
177 * should switch from dense (small) to sparse (large) blocks */
178 // #define DJ_CUTOFF1 8.0
179
/* Same, but for stopping large blocks and going to the next type
 * (currently, deferred blocks). Note that the time for large blocks grows
182 * faster with multi-cores, probably because of the strong dependency on
183 * TLB availability.
184 */
185 // #define DJ_CUTOFF2 24.0
186
187 /* How many buckets in large slices, and in large slices which are
188 * contained in huge slices. */
189 // #define LSL_NBUCKETS_MAX 256
190
191 /* How many large slices MINIMUM must go in a huge slice ? Below this
192 * value, we keep going producing large slices. */
193 // note that this part of the code is currently inactive
194 #define HUGE_MPLEX_MIN 2 /* default so that the code compiles */
195
196 /* Here, I've seen 97 slices be handled best with 2 mplexed rounds. So
197 * let's fix 64. */
198 // note that this part of the code is currently inactive
199 #define HUGE_MPLEX_MAX 64 /* default so that the code compiles */
200
201 /* read batches of VSC_BLOCKS_ROW_BATCH rows, in order to have something
 * which is less sensitive to variations incurred by splitting the matrix
203 * */
204 // #define VSC_BLOCKS_ROW_BATCH 512
205
206 /* blocks smaller than this amount will be merged with previous / next
207 * block */
208 // #define VSC_BLOCKS_TOO_SMALL_CUTOFF 8192
209
210 /* The number of flush frequencies (number of staircase steps) is
211 * inversely proportional to this. The larger this value, the taller the
212 * steps, and in return, the larger the temp buffers. The effect is
213 * generally mild but noticeable.
214 */
215 // #define VSC_BLOCKS_FLUSHFREQ_RATIO 3
216 /* }}} */
217
218 /* {{{ Examples of parameter sets */
219 /* These defaults work well for 13.4M rows/cols 4x4 submatrix of
220 * snfs247.small on a Xeon L5640 (Westmere), for a single core. Note that
221 * we define L1_CACHE_SIZE to something insane, because in fact we're
222 * utilizing L2.
223 */
224 #if 0
225 #define L1_CACHE_SIZE 192000
226 #define L2_CACHE_SIZE 1600000
227 #define DJ_CUTOFF1 8.0
228 #define DJ_CUTOFF2 24.0
229 #define LSL_NBUCKETS_MAX 256
230 #define VSC_BLOCKS_ROW_BATCH 512
231 #define VSC_BLOCKS_TOO_SMALL_CUTOFF 8192
232 #define VSC_BLOCKS_FLUSHFREQ_RATIO 3
233 #endif
234
/* This parameter set is successful for 4.6M rows x 2.8M cols, 12x20
236 * submatrix of the same snfs247.sparse, for 4 simultaneous cores of a
237 * Xeon X3440. In effect, we're disabling large slices here, and use
238 * taller steps for vsc.
239 */
240 #if 1
241 #define L1_CACHE_SIZE 262144
242 #define L2_CACHE_SIZE 800000
243 #define DJ_CUTOFF1 8.0
244 #define DJ_CUTOFF2 24.0 /* This is so small that large slices don't show up */
245 #define LSL_NBUCKETS_MAX 32
246 #define VSC_BLOCKS_ROW_BATCH 256
247 #define VSC_BLOCKS_TOO_SMALL_CUTOFF 8192
248 #define VSC_BLOCKS_FLUSHFREQ_RATIO 8.0
249 #endif
250
251 /* These used to be the cado-nfs defaults for a long time. They are
252 * visibly inferior to the previous parameters on the L5640. */
253 #if 0
254 #define L1_CACHE_SIZE 28000
255 #define L2_CACHE_SIZE 1600000
256 #define DJ_CUTOFF1 4.5
257 #define DJ_CUTOFF2 8.0
258 #define LSL_NBUCKETS_MAX 256
259 #define VSC_BLOCKS_ROW_BATCH 256
260 #define VSC_BLOCKS_TOO_SMALL_CUTOFF 8192
261 #define VSC_BLOCKS_FLUSHFREQ_RATIO 1.7
262 #endif
263 /* }}} */
264
265 #define xxxDEBUG_BUCKETS
266
267 /* This extension is used to distinguish between several possible
268 * implementations of the matrix-times-vector product.
269 */
270 #define MM_EXTENSION "-bucket"
271
272 /* MM_MAGIC is stored in files, so as to check compatibility. The upper
 * word (MM_MAGIC_FAMILY) corresponds to the implementation itself, the
274 * lower one (MM_MAGIC_VERSION) to the n-th binary incompatible change
275 * (make sure to bump it) */
276 #define MM_MAGIC_FAMILY 0xa003UL
277 #define MM_MAGIC_VERSION 0x1015UL
278 #define MM_MAGIC (MM_MAGIC_FAMILY << 16 | MM_MAGIC_VERSION)
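/* i.e. MM_MAGIC == 0xa0031015UL with the values above */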
279
280 /* see matmul-basic.c */
281 #define MM_DIR0_PREFERS_TRANSP_MULT 1
282
template<typename T> inline T * ptrbegin(vector<T>& v) { return v.data(); }
template<typename T> inline T const * ptrbegin(vector<T> const & v) { return v.data(); }
template<typename T> inline T * ptrend(vector<T>& v) { return v.size() + ptrbegin(v); }
template<typename T> inline T const * ptrend(vector<T> const & v) { return v.size() + ptrbegin(v); }
287
288 #if 0
289 static unsigned int idiotic_sum(void * p, unsigned int nbytes)
290 {
291 uint8_t * q = (uint8_t *) p;
292 unsigned int res = 0x12345678;
293 for( ; nbytes-- ; q++) {
294 unsigned int z = *q;
295 z ^= z << 24;
296 res = (((res << 8) ^ (res * z)) - (z ^ 0xabababab)) ^ res >> 13;
297 }
298 return res;
299 }
300 #endif
301
302 #define SLICE_TYPE_NONE 0
303 #define SLICE_TYPE_SMALL1 1
304 #define SLICE_TYPE_SMALL1_VBLOCK 2
305 #define SLICE_TYPE_SMALL2 3
306
307 #define SLICE_TYPE_LARGE_ENVELOPE 4
308
309 /* When coefficients go through several passes, they are considered as
310 * one slice for each pass. Note that the ordering of some slices is
311 * mandated by the design, e.g. slices of type LARGE_ASB must follow
312 * immediately their parent LARGE_FBI
313 */
314
315 #if 0
316 #define SLICE_TYPE_LARGE_FBI 5
317 #define SLICE_TYPE_LARGE_ASB 6
318 #endif
319
320 #define SLICE_TYPE_HUGE_ENVELOPE 7
321
322 #if 0
323 #define SLICE_TYPE_HUGE_FBI 8
324 #define SLICE_TYPE_HUGE_FBD 9
325 #define SLICE_TYPE_HUGE_ASB 10
326 #endif
327
328 #define SLICE_TYPE_DEFER_ENVELOPE 11
329 #define SLICE_TYPE_DEFER_COLUMN 12
330 #define SLICE_TYPE_DEFER_DIS 13
331 #define SLICE_TYPE_DEFER_ROW 14
332 #define SLICE_TYPE_DEFER_CMB 15
333
334 #define SLICE_TYPE_MAX 16
335
336
static inline const char* slice_name(int s) {
338 switch(s) {
339 case SLICE_TYPE_SMALL1_VBLOCK: return "small1v";
340 case SLICE_TYPE_SMALL1: return "small1";
341 case SLICE_TYPE_SMALL2: return "small2";
342 case SLICE_TYPE_LARGE_ENVELOPE: return "large";
343 case SLICE_TYPE_HUGE_ENVELOPE: return "huge";
344 case SLICE_TYPE_DEFER_ENVELOPE: return "defer";
345 case SLICE_TYPE_DEFER_COLUMN: return "d-col";
346 case SLICE_TYPE_DEFER_DIS: return "d-dis";
347 case SLICE_TYPE_DEFER_ROW: return "d-row";
348 case SLICE_TYPE_DEFER_CMB: return "d-cmb";
349 default: return "other";
350 }
351 }
352
353 struct slice_header_t {
354 uint16_t t;
355 uint16_t nchildren; // w.r.t the tree-like shape of the block structure.
356 uint8_t pad[4]; // make the structure 256-bit wide.
357 uint32_t i0, i1;
358 uint32_t j0, j1;
359 uint64_t ncoeffs;
360 };
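/* With the fields above, the header occupies 2+2+4 + 4*4 + 8 = 32 bytes
 * (the 256 bits the pad[] comment refers to); a
 * static_assert(sizeof(slice_header_t) == 32, "bad header size") would
 * be a reasonable sanity check, assuming the compiler inserts no extra
 * padding. */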
361
362 /* Ok, there's only one field for the moment. But there might be more
363 * eventually */
364 struct slice_runtime_stats {
365 double t;
    slice_runtime_stats() : t(0) {}
367 };
368
369 struct matmul_bucket_methods {
370 int small1;
371 int small2;
372 int large;
373 int huge;
374 int vsc;
    inline matmul_bucket_methods(const char * desc = NULL) {
376 small1=small2=large=vsc=huge=0;
377 if (!desc) {
378 /* default configuration */
379 small1=small2=large=vsc=1;
380 return;
381 }
382 const char * p = desc;
383 for( ; ; ) {
384 const char * q = strchr(p, ',');
385 int n = q ? q-p : strlen(p);
386 if (strncmp(p, "small1", n) == 0) {
387 ASSERT_ALWAYS(!small1);
388 small1=1;
389 } else if (strncmp(p, "small2", n) == 0) {
390 ASSERT_ALWAYS(!small2);
391 small2=1;
392 } else if (strncmp(p, "large", n) == 0) {
393 ASSERT_ALWAYS(!large);
394 large=1;
395 } else if (strncmp(p, "huge", n) == 0) {
396 ASSERT_ALWAYS(!huge && !vsc);
397 huge=1;
398 } else if (strncmp(p, "vsc", n) == 0) {
399 ASSERT_ALWAYS(!huge && !vsc);
400 vsc=1;
401 } else {
402 fprintf(stderr, "Parse error: %s\n", p);
403 }
404 if (!q) break;
405 p = q + 1;
406 }
407 ASSERT_ALWAYS(small1||!small2);
408 ASSERT_ALWAYS(small1||small2||large||vsc||huge);
409 }
    inline bool operator<(matmul_bucket_methods const& o) {
411 if (vsc != o.vsc) return vsc < o.vsc;
412 if (huge != o.huge) return huge < o.huge;
413 if (large != o.large) return large < o.large;
414 if (small1 != o.small1) return small1 < o.small1;
415 if (small2 != o.small2) return small2 < o.small2;
416 return false;
417 }
    inline bool something_beyond(const char * s) const {
419 matmul_bucket_methods o(s);
420 if (o.huge || o.vsc) return false;
421 if (o.large) return huge || vsc;
422 if (o.small1 || o.small2) return large || huge || vsc;
423 return true;
424 }
425 };
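/* Example: matmul_bucket_methods("small1,small2,large") selects the two
 * dense formats plus large slices and disables huge/vsc, while the
 * default (NULL) is equivalent to "small1,small2,large,vsc". */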
426
427 struct matmul_bucket_data_s {
428 /* repeat the fields from the public_ interface */
429 struct matmul_public_s public_[1];
430 /* now our private fields */
431 abdst_field xab;
432 size_t npack;
433 size_t scratch1size;
434 size_t scratch2size;
435 size_t scratch3size;
436 abelt * scratch1;
437 abelt * scratch2;
438 abelt * scratch3;
439 vector<uint16_t> t16; /* For small (dense) slices */
440 vector<uint8_t> t8; /* For large (sparse) slices */
441 vector<unsigned int> aux; /* Various descriptors -- fairly small */
442 /* headers are the first thing found in memory */
443 vector<slice_header_t> headers;
444 slice_runtime_stats main_timing;
445 vector<slice_runtime_stats> slice_timings;
446 matmul_bucket_methods methods;
447 };
448
void MATMUL_NAME(clear)(matmul_ptr mm0)
450 {
451 struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
452 if (mm->scratch1) abvec_clear(mm->xab, &mm->scratch1, mm->scratch1size);
453 if (mm->scratch2) abvec_clear(mm->xab, &mm->scratch2, mm->scratch2size);
454 if (mm->scratch3) abvec_clear(mm->xab, &mm->scratch3, mm->scratch3size);
455
456 matmul_common_clear(mm->public_);
457 // delete properly calls the destructor for members as well.
458 delete mm;
459 }
460
461 static void mm_finish_init(struct matmul_bucket_data_s * mm);
462
matmul_ptr MATMUL_NAME(init)(void * pxx, param_list pl, int optimized_direction)
464 {
465 abdst_field xx = (abdst_field) pxx;
466 struct matmul_bucket_data_s * mm;
467 mm = new matmul_bucket_data_s;
468
    /* We first make sure that everything gets set to zero; except that it
470 * may be more complicated than just calling memset, because we have
471 * some vector() types in there...
472 */
473 memset(mm->public_, 0, sizeof(struct matmul_public_s));
474 // memset(mm, 0, sizeof(struct matmul_bucket_data_s));
475 mm->scratch1size = 0; mm->scratch1 = NULL;
476 mm->scratch2size = 0; mm->scratch2 = NULL;
477 mm->scratch3size = 0; mm->scratch3 = NULL;
478
479 mm->xab = xx;
480
481 unsigned int npack = L1_CACHE_SIZE;
482 if (pl) param_list_parse_uint(pl, "l1_cache_size", &npack);
483 npack /= sizeof(abelt);
484 mm->npack = npack;
485
486 unsigned int scratch1size = L2_CACHE_SIZE/2;
487 if (pl) param_list_parse_uint(pl, "l2_cache_size", &scratch1size);
488 scratch1size /= sizeof(abelt);
489 mm->scratch1size = scratch1size;
490
491 unsigned int scratch2size;
492 scratch2size = scratch1size * HUGE_MPLEX_MAX; // (1 << 24)/sizeof(abelt));
493 mm->scratch2size = scratch2size;
494
495 int suggest = optimized_direction ^ MM_DIR0_PREFERS_TRANSP_MULT;
496 mm->public_->store_transposed = suggest;
497 if (pl) {
498 param_list_parse_int(pl, "mm_store_transposed",
499 &mm->public_->store_transposed);
500 if (mm->public_->store_transposed != suggest) {
501 fprintf(stderr, "Warning, mm_store_transposed"
502 " overrides suggested matrix storage ordering\n");
503 }
504 }
505
506 const char * tmp = NULL;
507
508 if (pl)
509 tmp = param_list_lookup_string(pl, "matmul_bucket_methods");
510
511 mm->methods = matmul_bucket_methods(tmp);
512
513 return (matmul_ptr)(mm);
514 }
515
/* This moves an element to the tail of a list with no copy, transferring
 * the ownership to the container argument */
518 template<typename T>
void transfer(list<T> * ctr, T * elem)
520 {
521 ctr->push_back(T());
522 swap(ctr->back(), *elem);
523 }
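/* Example: transfer(&L->vbl, &V) appends the contents of V to L->vbl
 * without a deep copy, and leaves V default-constructed (empty). */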
524
template<typename T> T underlying_type_max(T const&) { return numeric_limits<T>::max(); }
template<typename T> T underlying_type_min(T const&) { return numeric_limits<T>::min(); }
527 template<typename T, typename U> T safe_assign(T & a, U const& b) {
528 ASSERT_ALWAYS(b <= numeric_limits<T>::max());
529 ASSERT_ALWAYS(b >= numeric_limits<T>::min());
530 return a = b;
531 }
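/* Example: safe_assign(hdr->nchildren, vbl.size()) narrows a size_t
 * into the uint16_t field, aborting via ASSERT_ALWAYS if the value
 * would not fit. */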
532
533 struct builder {
534 uint32_t * data[2];
    // rowhead is a pointer which is naturally expected to *move* within
    // the data[0] array. (data[0] stores the rows in order: for each
    // row, a weight w followed by the w column indices of that row.)
537 uint32_t * rowhead;
538 uint32_t nrows_t;
539 uint32_t ncols_t;
540 const char * rowname;
541 const char * colname;
542 /* Statistics on the prime parameter affecting performance of
543 * subroutines. All are computed as (total sum, #samples) */
544 struct matmul_bucket_data_s * mm;
545 };
546
static void builder_init(builder * mb, struct matmul_bucket_data_s * mm, uint32_t * data)
548 {
549 memset(mb, 0, sizeof(struct builder));
550 ASSERT_ALWAYS(data);
551 mb->data[0] = data;
552 mb->data[1] = NULL;
553
554 mb->nrows_t = mm->public_->dim[ mm->public_->store_transposed];
555 mb->ncols_t = mm->public_->dim[!mm->public_->store_transposed];
556
557 mb->rowname = rowcol[ mm->public_->store_transposed];
558 mb->colname = rowcol[!mm->public_->store_transposed];
559 mb->rowhead = mb->data[0];
560 mb->mm = mm;
561 }
562
static void builder_clear(builder * mb)
564 {
565 free(mb->data[0]);
566 memset(mb, 0, sizeof(struct builder));
567 }
568
569 /************************************/
/* Elementary (!) building routines */
571
572 /* there's room for a slice type where data is stored row-major. It would
573 * make it possible to avoid most stores, for a nice performance gain. We
574 * might even consider accumulating four rows at a time to four different
575 * registers, so that the loads are compensated. The in-memory format
576 * must account for that.
577 */
578
579 /* {{{ small slices */
580
581 struct small_slice_t {
582 uint32_t i0;
583 uint32_t i1;
584 unsigned int ncoeffs;
585 double dj_avg;
586 uint32_t dj_max;
587 int is_small2;
588
589 typedef std::vector<std::pair<uint16_t, uint16_t> > Lv_t;
590 typedef Lv_t::const_iterator Lvci_t;
591 typedef Lv_t::iterator Lvi_t;
592
593 typedef std::deque<Lv_t> Lu_t;
594 typedef Lu_t::const_iterator Luci_t;
595 typedef Lu_t::iterator Lui_t;
596
597 Lu_t Lu;
598 };
599
static int builder_do_small_slice(builder * mb, struct small_slice_t * S, uint32_t i0, uint32_t i1)
601 {
602 S->i0 = i0;
603 S->i1 = i1;
604 S->ncoeffs = 0;
605 ASSERT_ALWAYS(i1-i0 <= (1 << 16) );
606
607 /* Make enough vstrips */
608 S->Lu.assign(iceildiv(mb->ncols_t, 1UL << 16), small_slice_t::Lv_t());
609 uint32_t * ptr0 = mb->rowhead;
610 /* We're doing a new slice */
611 for(uint32_t i = i0 ; i < i1 ; i++) {
612 for(unsigned int j = 0 ; j < *mb->rowhead ; j++) {
613 uint32_t jj = mb->rowhead[1+j];
614 S->Lu[jj>>16].push_back(std::make_pair(jj%(1<<16),i-i0));
615 S->ncoeffs++;
616 }
617 mb->rowhead += 1 + *mb->rowhead;
618 }
619 /* L is the list of (column index, row index) of all
620 * coefficients in the current horizontal slice */
621
622 /* Convert all j indices to differences */
623 S->dj_max = 0;
624 S->dj_avg = mb->ncols_t / (double) (S->ncoeffs + !S->ncoeffs);
625
626 typedef small_slice_t::Lui_t Lui_t;
627 typedef small_slice_t::Lvci_t Lvci_t;
628 uint32_t lu_offset = 0;
629 uint32_t j = 0;
630 for(Lui_t lup = S->Lu.begin() ; lup != S->Lu.end() ; ++lup, lu_offset+=1<<16) {
631 std::sort(lup->begin(), lup->end());
632 for(Lvci_t lvp = lup->begin() ; lvp != lup->end() ; ++lvp) {
633 uint32_t dj = (lu_offset + lvp->first) - j;
634 j = lu_offset + lvp->first;
635 if (dj > S->dj_max) { S->dj_max = dj; }
636 }
637 }
638
639 /* There are two possible reasons for this slice to be discarded.
640 * Either the max dj is too large -- larger than 16 bits -- or the
641 * average dj is too large -- larger than some guessed value.
642 */
643 /* Now that we're splitting in vblocks anyway, it's not important to
644 * constrain dj below 2^16 for small1 */
645 int keep1 = 1; // S->dj_max < (1UL << 16);
646 int keep2 = S->dj_max < (1 << SMALL_SLICES_DJ_BITS);
647
648 if ((i1-i0) >> SMALL_SLICES_I_BITS) keep2=0;
649
650 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
651 if (!keep1) printf(" [cannot be small1, beyond impl limits]");
652 if (!keep2) printf(" [cannot be small2, beyond impl limits]");
653 }
654
655 if (!mb->mm->methods.small2) keep2=0;
656
657 if (mb->mm->methods.something_beyond("small1,small2")) {
658 /* Then we have more than just small slices. So we're enforcing
659 * our separation criterion based on DJ */
660 keep1 = keep1 && S->dj_avg < DJ_CUTOFF1;
661 keep2 = keep2 && S->dj_avg < DJ_CUTOFF1;
662 }
663
664 S->is_small2 = keep2;
665
666 if (!keep1 && !keep2) {
667 mb->rowhead = ptr0;
668 }
669 return keep1 || keep2;
670 }
671
672 /* }}} */
673
674 /* {{{ large slices */
675
676 struct large_slice_vblock_t {
677 uint32_t j0;
678 uint32_t j1;
679 unsigned int n; // number of coeffs.
680 unsigned int pad; // (half the) number of padding coeffs.
681 vector<uint8_t> t8c; // read: contribution to t8.
682 vector<unsigned int> auxc; // read: contribution to aux.
683 };
684
685 struct large_slice_t {
686 slice_header_t hdr[1];
687 double dj_avg;
688 uint32_t dj_max;
689 list<large_slice_vblock_t> vbl;
690 };
691
692 struct large_slice_raw_t {
693 vector<uint8_t> ind[LSL_NBUCKETS_MAX];
694 vector<uint8_t> main;
695 vector<uint32_t> col_sizes;
696 vector<uint32_t> pad_sizes;
697 };
698
static void split_large_slice_in_vblocks(builder * mb, large_slice_t * L, large_slice_raw_t * R, unsigned int scratch1size)
700 {
701 /* Now split into vslices */
702 uint8_t * mp = ptrbegin(R->main);
703 uint8_t * ip[LSL_NBUCKETS_MAX];
704 for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
705 ip[k] = ptrbegin(R->ind[k]);
706 }
707 unsigned int vblocknum = 0;
708 double vbl_ncols_variance = 0;
709 for(uint32_t j = 0 ; j < mb->ncols_t ; vblocknum++) {
710 uint32_t j0 = j;
711 size_t n = 0;
712 size_t np = 0;
713 for( ; j < mb->ncols_t ; j++) {
714 size_t dn = R->col_sizes[j];
715 size_t dnp = R->pad_sizes[j];
716 if (n + dn + 2 * (np + dnp) > scratch1size) {
717 /* If dn doesn't fit, then we're in really, really big
718 * trouble ! */
719 ASSERT_ALWAYS(n != 0);
720 break;
721 }
722 n += dn;
723 np += dnp;
724 }
725 uint32_t j1 = j;
726
727 large_slice_vblock_t V;
728 V.j0 = j0;
729 V.j1 = j1;
730 /* First the main block, then all the bucket blocks. These come
731 * in order. */
732 /* We have nf coeffs, counting padding. This makes 2nf bytes in
733 * main, and nf bytes in the bucket blocks */
734 V.n = n;
735 V.pad = np;
736 size_t nf = n + 2 * np;
737 V.auxc.reserve(258);
738 V.auxc.push_back(V.j1 - V.j0);
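        /* auxc layout for this vblock: [ j1-j0, nf,
         * ind_sizes[0..LSL_NBUCKETS_MAX-1] ]; the reserve(258) above
         * matches the maximal case LSL_NBUCKETS_MAX == 256 (2 + 256). */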
739 /* XXX Notice that the reader process will consider as a j0 index
740 * the farthest possible index from the previous vblock, which
741 * means that we _must_ start with a zero in the main block here
742 * no matter what -- even if with regard to the way data is set
743 * up at the upper level (huge unique vblock), we would have had
744 * a positive integer. This digression does not apply to the
745 * first vblock.
746 *
747 * In the case where we're starting with a padding coefficient,
748 * then we might actually end up needing to cancel out _several_
749 * coefficients, since the padding coeffs need to be cancelled as
750 * well.
751 */
752 V.t8c.resize(3 * nf);
753 uint8_t * q = ptrbegin(V.t8c);
754 memcpy(q, mp, 2 * nf * sizeof(uint8_t));
755 if (vblocknum) {
756 uint8_t * pmp = q;
757 for(size_t k = 0 ; k + 2 <= nf ; k+=2, pmp+=4) {
758 if (pmp[0] != 255 || pmp[1] || pmp[2] || pmp[3])
759 break;
760 pmp[0] = 0;
761 }
762 if (pmp-q) {
763 /* It's ok to cancel padding coefficients, anyway it
764 * should be very rare. However, it would be bad to
765 * start cancelling a huge number of these, because then we
766 * would be better off adjusting the pointers (peeling
                 * off the phony computation at the beginning of mf[]).
768 * Problem is that in such a case, we would also have to
769 * adjust the indirection pointer as well, which would be
770 * messy. */
771 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
772 "*** cancelled %td padding coefficient\n", (pmp-q)/4);
773 }
774 pmp[0] = 0;
775 }
776 unsigned int ind_sizes[LSL_NBUCKETS_MAX] = {0,};
777 q += 2*nf;
778 for(size_t k = 0 ; k < nf ; k++) {
779 ind_sizes[mp[2 * k + 1]]++;
780 }
781 mp += 2 * nf;
782 V.auxc.push_back(nf);
783 for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
784 ASSERT(ip[k]+ind_sizes[k] <= ptrend(R->ind[k]));
785 memcpy(q, ip[k], ind_sizes[k] * sizeof(uint8_t));
786 q += ind_sizes[k];
787 ip[k] += ind_sizes[k];
788 V.auxc.push_back(ind_sizes[k]);
789
790 // mb->asb_avg[0] += ind_sizes[k];
791 // mb->asb_avg[1] += MIN(L->hdr->i1 - L->hdr->i0 - k * 256, 256);
792 }
793 ASSERT(q == ptrend(V.t8c));
794
795 vbl_ncols_variance += ((double)(j1-j0)) * ((double)(j1-j0));
796 #if 0
797 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
798 printf(" vbl%u", vblocknum);
799 printf(": %ss %u..%u ", mb->colname, j0, j1);
800 printf("; w %u", V.n);
801 printf("; avg dj %.1f", (j1 - j0) / (double) V.n);
802 if (V.pad) printf("; pad 6*%u", V.pad);
803 printf("\n");
804 }
805 #endif
806 transfer(&(L->vbl), &V);
807 }
808 double vbl_ncols_mean = mb->ncols_t;
809 vbl_ncols_mean /= vblocknum;
810 vbl_ncols_variance /= vblocknum;
811 double vbl_ncols_sdev = sqrt(vbl_ncols_variance - vbl_ncols_mean * vbl_ncols_mean);
812 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
813 " %u vblocks, sdev/avg = %.2f\n", vblocknum, vbl_ncols_sdev / vbl_ncols_mean);
814
815 }
816
/* Do the partial transpose. Since we're handling sparse areas, it's
 * quite a bit faster than doing the full-blown transpose (even though
 * transposing may be needed anyway because the input matrix is in the
 * wrong order, it saves us some hassle to only ever view the matrix in
 * one given way).
 *
 * Note though that there _is_ some loss; when testing, we no longer
 * have access to the pre-computed transposed file, since we recompute
 * the transposed parts each time.
 */
827
static uint32_t * do_partial_transpose(builder * mb, vector<uint32_t> & cs, uint32_t i0, uint32_t i1) /*{{{*/
829 {
830 uint32_t * cols;
831 uint32_t * qptr;
832
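    /* Counting-sort style transpose: a first pass accumulates the
     * weight of each column of the strip into cs[], a second pass
     * scatters the row indices through per-column write pointers. */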
833 uint32_t * ptr = mb->rowhead;
834 for(uint32_t i = i0 ; i < i1 ; i++) {
835 uint32_t w = *ptr++;
836 for( ; w-- ; ) {
837 uint32_t j = *ptr++;
838 cs[j]++;
839 }
840 }
841 cols = new uint32_t[ptr - mb->rowhead - (i1 - i0)];
842 qptr = cols;
843
844 vector<uint32_t *> colptrs;
845 for(uint32_t j = 0 ; j < mb->ncols_t ; j++) {
846 colptrs.push_back(qptr);
847 qptr += cs[j];
848 }
849 ASSERT(qptr-cols == ptr - mb->rowhead - (ptrdiff_t) (i1 - i0));
850 ptr = mb->rowhead;
851 for(uint32_t i = i0 ; i < i1 ; i++) {
852 uint32_t w = *ptr++;
853 for( ; w-- ; ) {
854 uint32_t j = *ptr++;
855 *colptrs[j]++ = i - i0;
856 }
857 }
858 colptrs.clear();
859 mb->rowhead = ptr;
860 return cols;
861 }/*}}}*/
862
static int builder_do_large_slice(builder * mb, struct large_slice_t * L, uint32_t i0, uint32_t i1, uint32_t imax, unsigned int scratch1size)
864 {
865 memset(L->hdr, 0, sizeof(slice_header_t));
866 L->hdr->t = SLICE_TYPE_LARGE_ENVELOPE;
867 L->hdr->j0 = 0;
868 L->hdr->j1 = mb->ncols_t;
869 L->hdr->i0 = i0;
870 L->hdr->i1 = i1;
871 L->hdr->ncoeffs = 0;
872
873 L->dj_max = 0;
874
875 large_slice_raw_t R[1];
876
877 R->col_sizes.assign(mb->ncols_t, 0);
878 R->pad_sizes.assign(mb->ncols_t, 0);
879
880 uint32_t * ptr0 = mb->rowhead;
881 uint32_t * cols = do_partial_transpose(mb, R->col_sizes, i0, i1);
882 uint32_t * qptr = cols;
883
884 uint32_t last_j = 0;
885
886 /* First we create a huge unique vblock, and later on decide on
887 * how to split it. */
888 for(uint32_t j = 0 ; j < mb->ncols_t ; j++) {
889 uint32_t len = R->col_sizes[j];
890
891 if (!len) continue;
892
893 uint32_t diff = j - last_j;
894 if (diff > L->dj_max) {
895 L->dj_max = diff;
896 }
897 for( ; diff > 255 ; ) {
898 R->main.push_back(255);
899 R->main.push_back(0);
900 R->ind[0].push_back(0);
901 R->main.push_back(0);
902 R->main.push_back(0);
903 R->ind[0].push_back(0);
904 diff -= 255;
905 R->pad_sizes[j] += 1;
906 }
907
908 /* Only the first entry in the column has a non-zero dj. So place
909 * 0 instead inside the loop, and fix it in the end. */
910 R->main.push_back(diff);
911 last_j = j;
912 for( ; len-- ; ) {
913 uint32_t i = *qptr++;
914 uint8_t w = i / 256;
915 R->main.push_back(w);
916 R->main.push_back(0);
917 R->ind[w].push_back((uint8_t) i);
918 L->hdr->ncoeffs ++;
919 }
920 R->main.pop_back();
921 }
922 ASSERT(qptr-cols == mb->rowhead - ptr0 - (ptrdiff_t) (i1 - i0));
923 qptr = NULL;
924 delete[] cols;
925 cols = NULL;
926
927 /* The average dj is quite easy */
928 L->dj_avg = mb->ncols_t / (double) L->hdr->ncoeffs;
929
930 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
931 " w=%" PRIu64 ", avg dj=%.1f, max dj=%u, bucket hit=1/%.1f",
932 L->hdr->ncoeffs, L->dj_avg, L->dj_max, LSL_NBUCKETS_MAX * L->dj_avg);
933
934 if (mb->mm->methods.something_beyond("large")) {
935 /* Then we may enforce our separation criterion */
936 if (L->dj_avg > DJ_CUTOFF2) {
937 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
938 "-> too sparse");
939 if (imax - i0 < HUGE_MPLEX_MIN * LSL_NBUCKETS_MAX * 256) {
940 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
941 "; kept because of short tail\n");
942 } else {
943 /* We won't keep this slice */
944 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD, "\n");
945 mb->rowhead = ptr0;
946 return 0;
947 }
948 }
949 }
950
951 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
952 "\n");
953
954 split_large_slice_in_vblocks(mb, L, R, scratch1size);
955
956 safe_assign(L->hdr->nchildren, L->vbl.size());
957
958 return 1;
959 }
960
961 /* }}} */
962
963 /* {{{ huge slices */
964
965 struct huge_slice_t {
966 slice_header_t hdr[1];
967 double dj_avg;
968 uint32_t dj_max;
969 unsigned int nlarge;
970 list<large_slice_vblock_t> vbl;
971 };
972
973 struct huge_subslice_raw_t {
974 vector<uint8_t> ind[LSL_NBUCKETS_MAX];
975 vector<uint8_t> main;
976 };
977
978 struct huge_subslice_ptrblock_t {
979 uint8_t * ip[LSL_NBUCKETS_MAX];
980 uint8_t * mp;
981 };
982
983 struct huge_slice_raw_t {
984 vector<uint8_t> super;
985 vector<huge_subslice_raw_t> subs;
986 vector<uint32_t> col_sizes;
987 vector<uint32_t> pad_sizes;
988 };
989
static void split_huge_slice_in_vblocks(builder * mb, huge_slice_t * H, huge_slice_raw_t * R, unsigned int scratch2size)/*{{{*/
991 {
992 /* Now split into vslices */
993 // unsigned int lsize = iceildiv(H->hdr->i1 - H->hdr->i0, H->nlarge);
994 uint8_t * sp = ptrbegin(R->super);
995 vector<huge_subslice_ptrblock_t> ptrs(R->subs.size());
996 for(unsigned int i = 0 ; i < R->subs.size() ; i++) {
997 ptrs[i].mp = ptrbegin(R->subs[i].main);
998 for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
999 ptrs[i].ip[k] = ptrbegin(R->subs[i].ind[k]);
1000 }
1001 }
1002 double vbl_ncols_variance = 0;
1003 unsigned int vblocknum = 0;
1004 for(uint32_t j = 0 ; j < mb->ncols_t ; vblocknum++) {
1005 uint32_t j0 = j;
1006 #if 0
1007 /* First way to advance to the next j -- compare the total number
1008 * of coeffs to the scratch2size argument */
1009 size_t n = 0;
1010 size_t np = 0;
1011 for( ; j < mb->ncols_t ; j++) {
1012 size_t dn = R->col_sizes[j];
1013 size_t dnp = R->pad_sizes[j];
1014 if (n + dn + 2 * (np + dnp) > scratch2size) {
1015 /* If dn doesn't fit, then we're in really, really big
1016 * trouble ! */
1017 ASSERT_ALWAYS(n != 0);
1018 break;
1019 }
1020 n += dn;
1021 np += dnp;
1022 }
1023 uint32_t j1 = j;
1024 #else
1025 /* However, there's another way. We might as well arrange so that
1026 * no mega bucket exceeds the L2 cache. */
1027 uint32_t j1 = j;
1028 uint8_t * sp0 = sp;
1029 uint8_t * spc = sp0;
1030 /* XXX Notice that the reader process will consider as a j0 index
1031 * the farthest possible index from the previous vblock, which
1032 * means that we _must_ start with a zero in the main block here
1033 * no matter what -- even if with regard to the way data is set
1034 * up at the upper level (huge unique vblock), we would have had
1035 * a positive integer. This digression does not apply to the
1036 * first vblock. */
1037 if (vblocknum) {
1038 sp0[0]=0;
1039 }
1040 {
1041 vector<unsigned int> Lsizes(H->nlarge, 0);
1042 for( ; j < mb->ncols_t && sp != ptrend(R->super) ; ) {
1043 unsigned int dj = *sp;
1044 j += dj;
1045 if (dj) {
1046 /* beginning of a column. */
1047 spc = sp;
1048 j1 = j;
1049 }
1050 sp++;
1051 unsigned int k = *sp++;
1052 Lsizes[k]++;
1053 if (Lsizes[k] > scratch2size / HUGE_MPLEX_MAX) {
1054 Lsizes[k]--;
1055 sp = spc;
1056 break;
1057 }
1058 }
1059 if (sp == ptrend(R->super)) {
1060 spc = sp;
1061 j1 = mb->ncols_t;
1062 }
1063 }
1064 /* TODO: abandon the n/np distinction. It's useful for debugging
1065 * at best, and it would not hurt to have it displayed only at the
1066 * upper level.
1067 */
1068 size_t n = 0;
1069 size_t np = 0;
1070 for(j = j0 ; j < j1 ; j++) {
1071 n += R->col_sizes[j];
1072 np += R->pad_sizes[j];
1073 }
1074 ASSERT_ALWAYS((n+2*np)*2 == (size_t) (spc - sp0));
1075 sp = sp0;
1076 #endif
1077
1078 /* That's the same type, although it contains more stuff. */
1079 large_slice_vblock_t V;
1080 V.j0 = j0;
1081 V.j1 = j1;
1082 V.auxc.reserve(1 + H->nlarge * (1 + LSL_NBUCKETS_MAX));
1083 V.auxc.push_back(V.j1 - V.j0);
1084 /* First the super block, and then for each contained large
1085 * slice, the main block, then all the bucket blocks. These come
1086 * in order. */
1087 /* We have nf coeffs, counting padding. This makes 2nf bytes in
1088 * super, nf in mains, and nf in the bucket blocks */
1089 V.n = n;
1090 V.pad = np;
1091 size_t nf = n + 2 * np;
1092 V.t8c.resize(4 * nf);
1093 uint8_t * q = ptrbegin(V.t8c);
1094 memcpy(q, sp, 2 * nf * sizeof(uint8_t));
1095 /* TODO: If the solution above is to be kept, then of course we'd
1096 * rather reuse this thing above, since it has already been
1097 * computed.
1098 */
1099 vector<unsigned int> L_sizes(H->nlarge, 0);
1100 q += 2*nf;
1101 for(size_t k = 0 ; k < nf ; k++) {
1102 ASSERT(sp[2 * k + 1] < H->nlarge);
1103 L_sizes[sp[2 * k + 1]]++;
1104 }
1105 sp += 2 * nf;
1106 V.auxc.push_back(nf);
1107 for(unsigned int l = 0 ; l < H->nlarge ; l++) {
1108 V.auxc.push_back(L_sizes[l]);
1109 }
1110 for(unsigned int l = 0 ; l < H->nlarge ; l++) {
1111 unsigned int ind_sizes[LSL_NBUCKETS_MAX] = {0,};
1112 memcpy(q, ptrs[l].mp, L_sizes[l] * sizeof(uint8_t));
1113 q += L_sizes[l];
1114 for(unsigned int k = 0 ; k < L_sizes[l] ; k++) {
1115 ind_sizes[ptrs[l].mp[k]]++;
1116 }
1117 ptrs[l].mp += L_sizes[l];
1118
1119 // unsigned int i_size = MIN(H->hdr->i1 - H->hdr->i0 - l * lsize, lsize);
1120
1121 for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
1122 ASSERT(ptrs[l].ip[k]-(ptrbegin(R->subs[l].ind[k]))+(ptrdiff_t) ind_sizes[k] <= (ptrdiff_t) R->subs[l].ind[k].size());
1123 memcpy(q, ptrs[l].ip[k], ind_sizes[k] * sizeof(uint8_t));
1124 q += ind_sizes[k];
1125 ptrs[l].ip[k] += ind_sizes[k];
1126 V.auxc.push_back(ind_sizes[k]);
1127 // mb->asb_avg[0] += ind_sizes[k];
1128 // mb->asb_avg[1] += MIN(i_size - k * 256, 256);
1129 }
1130 }
1131 ASSERT(q == ptrend(V.t8c));
1132
1133 vbl_ncols_variance += ((double)(j1-j0)) * ((double)(j1-j0));
1134
1135 /* TODO: have some sort of verbosity level */
1136 #if 0
1137 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1138 printf(" vbl%u", vblocknum);
1139 printf(": %ss %u..%u ", mb->colname, j0, j1);
1140 printf("; w %u", V.n);
1141 printf("; avg dj %.1f", (j1 - j0) / (double) V.n);
1142 if (V.pad) printf("; pad 6*%u", V.pad);
1143 printf("\n");
1144 }
1145 #endif
1146
1147 transfer(&(H->vbl), &V);
1148 }
1149 if (!vblocknum) {
1150 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1151 " 0 vblocks\n");
1152 return;
1153 }
1154 double vbl_ncols_mean = mb->ncols_t;
1155 vbl_ncols_mean /= vblocknum;
1156 vbl_ncols_variance /= vblocknum;
1157 double vbl_ncols_sdev = sqrt(vbl_ncols_variance - vbl_ncols_mean * vbl_ncols_mean);
1158 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1159 " %u vblocks, sdev/avg = %.2f\n", vblocknum, vbl_ncols_sdev / vbl_ncols_mean);
1160 }/*}}}*/
1161
static int builder_do_huge_slice(builder * mb, struct huge_slice_t * H, uint32_t i0, uint32_t i1, unsigned int scratch2size)
1163 {
1164 memset(H->hdr, 0, sizeof(slice_header_t));
1165 H->hdr->t = SLICE_TYPE_HUGE_ENVELOPE;
1166 H->hdr->j0 = 0;
1167 H->hdr->j1 = mb->ncols_t;
1168 H->hdr->i0 = i0;
1169 H->hdr->i1 = i1;
1170 H->hdr->ncoeffs = 0;
1171
1172 H->dj_max = 0;
1173
1174 huge_slice_raw_t R[1];
1175
1176 R->col_sizes.assign(mb->ncols_t, 0);
1177 R->pad_sizes.assign(mb->ncols_t, 0);
1178
1179 uint32_t * ptr0 = mb->rowhead;
1180 uint32_t * cols = do_partial_transpose(mb, R->col_sizes, i0, i1);
1181 uint32_t * qptr = cols;
1182
1183 /* How many large slices in this huge slice ? */
1184
1185 H->nlarge = iceildiv(i1 - i0, LSL_NBUCKETS_MAX * 256);
1186 ASSERT(H->nlarge >= HUGE_MPLEX_MIN);
1187 ASSERT(H->nlarge <= HUGE_MPLEX_MAX);
1188 unsigned int lsize = iceildiv(i1 - i0, H->nlarge);
1189 ASSERT(lsize <= LSL_NBUCKETS_MAX * 256);
1190 ASSERT(lsize * H->nlarge >= i1 - i0);
1191 R->subs.assign(H->nlarge, huge_subslice_raw_t());
1192
1193 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1194 " (%u*%u) ", H->nlarge, lsize);
1195 uint32_t last_j = 0;
1196
1197 /* First we create a huge unique vblock, and later on decide on
1198 * how to split it. */
1199 uint32_t next_dot = mb->ncols_t / H->nlarge;
1200 for(uint32_t j = 0 ; j < mb->ncols_t ; j++) {
1201 uint32_t len = R->col_sizes[j];
1202
1203 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1204 if (j > next_dot) {
1205 printf(".");
1206 fflush(stdout);
1207 next_dot += mb->ncols_t / H->nlarge;
1208 }
1209 }
1210
1211 if (!len)
1212 continue;
1213
1214 uint32_t diff = j - last_j;
1215 if (diff > H->dj_max) {
1216 H->dj_max = diff;
1217 }
1218 for( ; diff > 255 ; ) {
1219 R->super.push_back(255); // dj
1220 R->super.push_back(0); // k
1221 R->subs[0].main.push_back(0);
1222 R->subs[0].ind[0].push_back(0);
1223 R->super.push_back(0); // dj
1224 R->super.push_back(0); // k
1225 R->subs[0].main.push_back(0);
1226 R->subs[0].ind[0].push_back(0);
1227 diff -= 255;
1228 R->pad_sizes[j] += 1;
1229 }
1230
1231 /* Only the first entry in the column has a non-zero dj. So place
1232 * 0 instead inside the loop, and fix it in the end. */
1233 R->super.push_back(diff);
1234 last_j = j;
1235 for( ; len-- ; ) {
1236 uint32_t i = *qptr++;
1237 uint8_t w = i / lsize;
1238 uint8_t wq = (i % lsize) / 256;
1239 uint8_t wr = i % lsize;
1240 R->super.push_back(w);
1241 R->super.push_back(0);
1242 R->subs[w].main.push_back(wq);
1243 R->subs[w].ind[wq].push_back(wr);
1244 H->hdr->ncoeffs ++;
1245 }
1246 R->super.pop_back();
1247 }
1248 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD, "\n");
1249 ASSERT(qptr-cols == mb->rowhead - ptr0 - (ptrdiff_t) (i1 - i0));
1250 qptr = NULL;
1251 delete[] cols;
1252 cols = NULL;
1253
1254 /* The average dj is quite easy */
1255 H->dj_avg = mb->ncols_t / (double) H->hdr->ncoeffs;
1256
1257 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1258 " w=%" PRIu64 ", avg dj=%.1f, max dj=%u, bucket block hit=1/%.1f\n",
1259 H->hdr->ncoeffs, H->dj_avg, H->dj_max,
1260 H->nlarge * H->dj_avg);
1261
1262 if (0) {
1263 /* Don't keep -- well, here this never occurs */
1264 mb->rowhead = ptr0;
1265 return 0;
1266 }
1267
1268 split_huge_slice_in_vblocks(mb, H, R, scratch2size);
1269
1270 safe_assign(H->hdr->nchildren, H->vbl.size());
1271
1272 return 1;
1273 }
1274
1275 /* }}} */
1276
1277 /* {{{ vertical staircase slices */
1278
1279 struct vsc_step_t {
1280 unsigned int defer;
1281 uint32_t nrows;
1282 unsigned int density_upper_bound;
1283 unsigned long tbuf_space;
1284 };
1285
static inline unsigned int when_flush(unsigned int k, unsigned int nv, unsigned int defer)
1287 {
1288 k -= k % defer;
1289 k += defer - 1;
1290 k = min(k, nv-1);
1291 return k;
1292 }
1293
static inline int flush_here(unsigned int k, unsigned int nv, unsigned int defer) {
1295 return k == when_flush(k,nv,defer);
1296 }
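/* when_flush() rounds k up to the last vstrip of its defer-group.
 * Worked example with nv = 7 vstrips and defer = 3: vstrips 0,1,2 flush
 * at vstrip 2, vstrips 3,4,5 at vstrip 5, and vstrip 6 at vstrip 6 (the
 * tail group is clipped to nv-1). */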
1297
1298 struct vsc_sub_slice_t {
1299 slice_header_t hdr[1];
1300 vector<uint16_t> x;
1301 vector<uint8_t> c;
1302 };
1303
1304 struct vsc_middle_slice_t {
1305 slice_header_t hdr[1];
1306 vector<vsc_sub_slice_t> sub;
1307 };
1308
1309 struct vsc_slice_t {
1310 /* This header is mostly a placeholder, and is not here to reflect an
1311 * additional pass on the data. */
1312 slice_header_t hdr[1];
1313 vector<vsc_step_t> steps;
1314 vector<vsc_middle_slice_t> dispatch;
1315 vector<uint32_t> transpose_help;
1316 unsigned long tbuf_space;
1317 };
1318
1319 /* {{{ decide on our flushing periods (this step is data independent) */
static vector<unsigned int> flush_periods(unsigned int nvstrips)
1321 {
1322 vector<unsigned int> fp;
1323 for(unsigned int nf = 1 ; ; ) {
1324 unsigned int quo = min(255u, iceildiv(nvstrips, nf));
1325 if (fp.empty() || quo < fp.back()) {
1326 fp.push_back(quo);
1327 }
1328 if (quo == 1)
1329 break;
1330 // next ?
1331 nf = max(nf + 1, (unsigned int) (nf * VSC_BLOCKS_FLUSHFREQ_RATIO));
1332 }
1333 std::reverse(fp.begin(), fp.end());
1334 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1335 printf("Considering cells to be flushed every");
1336 for(unsigned int s = 0 ; s < fp.size() ; s++) {
1337 printf(" %u", fp[s]);
1338 }
1339 printf(" vstrips\n");
1340 }
1341 ASSERT_ALWAYS(fp.back() == min(255u, nvstrips));
1342 ASSERT_ALWAYS(fp.front() == 1);
1343
1344 return fp;
1345 }
1346 /* }}} */
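/* Worked example, assuming nvstrips = 100 and the currently active
 * VSC_BLOCKS_FLUSHFREQ_RATIO = 8.0: nf takes the values 1, 8, 64, 512,
 * giving quotients 100, 13, 2, 1, so that fp ends up as [1, 2, 13, 100]
 * after reversal: the densest step flushes every vstrip, the sparsest
 * one only once. */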
1347
1348 /* {{{ read the row weights */
static vector<unsigned long> rowblock_weights(builder * mb, struct vsc_slice_t * V)
1350 {
1351 vector<unsigned long> blockweight;
1352 uint32_t * ptr = mb->rowhead;
1353 /* It's a bit unfortunate, but since here we do speedy parsing of the
1354 * row weights, we're taking a shortcut which is valid only if our
1355 * data set spans the entire column range */
1356 ASSERT_ALWAYS(V->hdr->j0 == 0 && V->hdr->j1 == mb->ncols_t);
1357 for(uint32_t i = V->hdr->i0 ; i < V->hdr->i1 ; ) {
1358 unsigned long bw = 0;
1359 for(uint32_t di = 0 ; di < VSC_BLOCKS_ROW_BATCH && i < V->hdr->i1 ; i++, di++) {
1360 unsigned int w = *ptr++;
1361 ptr += w;
1362 bw += w;
1363 }
1364 blockweight.push_back(bw);
1365 }
1366 return blockweight;
1367 }
1368 /* }}} */
1369
1370 /*{{{*/
1371 static void
compute_staircase(builder * mb, struct vsc_slice_t * V)
1373 {
1374 unsigned int nvstrips = V->dispatch.size();
1375
1376 vector<unsigned long> blockweight = rowblock_weights(mb, V);
1377 vector<unsigned int> flushperiod = flush_periods(nvstrips);
1378
1379 /* {{{ first dispatching pass */
1380 uint32_t ii = V->hdr->i0;
1381 int s = 0;
1382 for(unsigned int k = 0 ; ; k++) {
1383 if (k == blockweight.size() ||
1384 (s < (int) flushperiod.size() - 1 &&
1385 blockweight[k] < VSC_BLOCKS_ROW_BATCH * nvstrips / flushperiod[s]))
1386 {
1387 uint32_t old_ii = ii;
1388 ii = min(V->hdr->i0 + k * VSC_BLOCKS_ROW_BATCH, V->hdr->i1);
1389 uint32_t di = ii - old_ii;
1390 if (di) {
1391 vsc_step_t step;
1392 step.defer = flushperiod[s];
1393 step.nrows = di;
1394 step.tbuf_space = 0; // to be set later.
1395 step.density_upper_bound = (s == 0) ?
1396 UINT_MAX : iceildiv(nvstrips, flushperiod[s-1]);
1397 V->steps.push_back(step);
1398 }
1399 s++;
1400 }
1401 if (k == blockweight.size())
1402 break;
1403 }
1404 /* }}} */
1405 /* {{{ second pass: do some merges for smallish areas */
1406 ii = V->hdr->i0;
1407 for(unsigned int k = 0 ; k < V->steps.size() ; ) {
1408 unsigned long w = V->steps[k].nrows;
1409 uint32_t old_ii = ii;
1410 int merge_with_next = 0;
1411 int merge_with_previous = 0;
1412 if ((k+1) < V->steps.size())
1413 merge_with_next = w <= VSC_BLOCKS_TOO_SMALL_CUTOFF; // || w <= V->steps[k+1].nrows / 10;
1414 if (k > 0)
1415 merge_with_previous = w <= VSC_BLOCKS_TOO_SMALL_CUTOFF; // || w <= V->steps[k-1].nrows / 10;
1416 if (!merge_with_previous && !merge_with_next) {
1417 ii += w;
1418 k++;
1419 continue;
1420 }
1421 /* Otherwise it's a bit ridiculous to have a separate block for
1422 * such a small number of rows. So by default, we choose to merge
1423 * it with the next block, or the previous one if we're reaching
1424 * the end. */
1425 V->steps.erase(V->steps.begin() + k);
1426 if (merge_with_next) {
1427 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1428 "strip %" PRIu32 "+%lu merged with next\n",
1429 old_ii, w);
1430 ASSERT_ALWAYS(k < V->steps.size());
1431 V->steps[k].nrows += w;
1432 // don't increment k here.
1433 // don't increment ii either.
1434 } else {
1435 ASSERT_ALWAYS(merge_with_previous);
1436 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1437 "strip %" PRIu32 "+%lu merged with previous\n",
1438 old_ii, w);
1439 V->steps[k-1].nrows += w;
1440 // don't increment k here.
1441 ii += w;
1442 }
1443 }
1444 /* }}} */
1445 }
1446 /*}}}*/
1447
1448 /*{{{*/
1449 static
void vsc_fill_buffers(builder * mb, struct vsc_slice_t * V)
1451 {
1452 unsigned int nvstrips = V->dispatch.size();
1453 uint32_t width = iceildiv(V->hdr->j1 - V->hdr->j0, nvstrips);
1454 ASSERT_ALWAYS(width > 0);
1455 uint32_t * ptr = mb->rowhead;
1456 uint32_t i = V->hdr->i0;
1457 V->tbuf_space = 0;
1458
1459 for(unsigned int s = 0 ; s < V->steps.size() ; s++) {
1460 /* This strip is scheduled to be flushed every V->steps[s].defer
1461 * vstrips. */
1462 unsigned int defer = V->steps[s].defer;
1463 uint32_t old_ii = i;
1464 uint32_t ii = i + V->steps[s].nrows;
1465 ASSERT_ALWAYS(ii <= V->hdr->i1);
1466 for( ; i < ii ; i++) {
1467 unsigned int w = *ptr++;
1468 for( ; w-- ; ) {
1469 uint32_t j = *ptr++;
1470 unsigned int d = j / width;
1471 V->dispatch[d].sub[s].hdr->ncoeffs++;
1472 V->dispatch[d].sub[s].x.push_back(j % width);
1473 unsigned int fidx = when_flush(d,nvstrips,defer);
1474 V->dispatch[fidx].sub[s].c.push_back(1 + (d % defer));
1475 }
1476 for(unsigned int d = 0 ; d < nvstrips ; d+= defer) {
1477 /* row change */
1478 unsigned int fidx = when_flush(d,nvstrips,defer);
1479 V->dispatch[fidx].sub[s].c.push_back(0);
1480 }
1481 }
1482 /*
1483 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1484 printf("Post counts\n");
1485 for(unsigned int d = 0 ; d < nvstrips ; d++) {
1486 printf(" (nrows=%u) [%u].sub[%u] : %lu coeffs ; xsize=%zu, csize=%zu\n", V->steps[s].nrows, d, s, V->dispatch[d].sub[s].hdr->ncoeffs, V->dispatch[d].sub[s].x.size(), V->dispatch[d].sub[s].c.size());
1487 }
1488 }
1489 */
1490 unsigned int acc = 0;
1491 for(unsigned int d = 0 ; d < nvstrips ; d++) {
1492 acc += V->dispatch[d].sub[s].hdr->ncoeffs;
1493 if (!flush_here(d,nvstrips,defer))
1494 continue;
1495 ASSERT(V->dispatch[d].sub[s].c.size() == acc + V->steps[s].nrows);
1496 acc = 0;
1497 }
1498 unsigned long m = 0;
1499 unsigned long cm = 0;
1500 for(unsigned int d = 0 ; d < nvstrips ; d++) {
1501 if (d % defer == 0) cm = 0;
1502 cm += V->dispatch[d].sub[s].hdr->ncoeffs;
1503 if (cm >= m) m = cm;
1504 }
1505 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1506 printf("Rows %" PRIu32 "+%" PRIu32, old_ii, V->steps[s].nrows);
1507 if (V->steps[s].density_upper_bound != UINT_MAX) {
1508 printf(": d < %u", V->steps[s].density_upper_bound);
1509 }
1510 printf("; %u flushes (every %u), tbuf space %lu\n", iceildiv(nvstrips, defer), defer, m);
1511 }
1512 m += 1; // add space for one dummy pointer.
1513 V->steps[s].tbuf_space = m;
1514 V->tbuf_space += m;
1515 }
1516 for(unsigned int d = 0 ; d < nvstrips ; d++) {
1517 for(unsigned int s = 0 ; s < V->steps.size() ; s++) {
1518 V->dispatch[d].hdr->ncoeffs+=V->dispatch[d].sub[s].hdr->ncoeffs;
1519 }
1520 V->hdr->ncoeffs+=V->dispatch[d].hdr->ncoeffs;
1521 }
1522
1523 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1524 "Total tbuf space %lu (%lu MB)\n",
1525 V->tbuf_space, (V->tbuf_space * sizeof(abelt)) >> 20);
1526 }
1527 /*}}}*/
1528
static int builder_prepare_vsc_slices(builder * mb, struct vsc_slice_t * V, uint32_t i0, uint32_t imax)
1530 {
1531 memset(V->hdr, 0, sizeof(slice_header_t));
1532 V->hdr->t = SLICE_TYPE_DEFER_ENVELOPE;
1533 V->hdr->i0 = i0;
1534 V->hdr->i1 = imax;
1535 V->hdr->j0 = 0;
1536 V->hdr->j1 = mb->ncols_t;
1537 V->hdr->ncoeffs = 0;
1538
1539 unsigned int nvstrips = iceildiv(V->hdr->j1 - V->hdr->j0, 1 << 16);
1540 safe_assign(V->hdr->nchildren, nvstrips);
1541
1542 uint32_t width = iceildiv(V->hdr->j1 - V->hdr->j0, nvstrips);
1543 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1544 "%u vstrips of width %" PRIu32 "\n", nvstrips, width);
1545
1546 // prepare the dispatch slice headers
1547 for(uint32_t j = V->hdr->j0 ; j < V->hdr->j1 ; j += width) {
1548 vsc_middle_slice_t v;
1549 memset(v.hdr, 0, sizeof(slice_header_t));
1550 v.hdr->t = SLICE_TYPE_DEFER_COLUMN;
1551 v.hdr->i0 = V->hdr->i0;
1552 v.hdr->i1 = V->hdr->i1;
1553 v.hdr->j0 = j;
1554 v.hdr->j1 = min(j + width, V->hdr->j1);
1555 /* ncoeffs is set later, as is the data array x */
1556 v.hdr->ncoeffs = 0;
1557 V->dispatch.push_back(v);
1558 }
1559 ASSERT(V->dispatch.size() == nvstrips);
1560
1561 compute_staircase(mb, V);
1562 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
1563 safe_assign(V->dispatch[k].hdr->nchildren, V->steps.size());
1564 uint32_t i0 = V->hdr->i0;
1565 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1566 vsc_sub_slice_t w;
1567 memset(w.hdr, 0, sizeof(slice_header_t));
1568 w.hdr->t = SLICE_TYPE_DEFER_DIS;
1569 w.hdr->i0 = i0;
1570 w.hdr->i1 = (i0 += V->steps[l].nrows);
1571 w.hdr->j0 = V->dispatch[k].hdr->j0;
1572 w.hdr->j1 = V->dispatch[k].hdr->j1;
1573 w.hdr->ncoeffs = 0;
1574 w.hdr->nchildren = 0;
1575 V->dispatch[k].sub.push_back(w);
1576 }
1577 }
1578 vsc_fill_buffers(mb, V);
1579
1580 return 0;
1581 }
1582
1583 /* }}} */
1584
1585 /*************************************************************************/
1586 /* Pushing slices to mm ; all these routines clear the given slice stack */
1587
1588 /* {{{ small slices */
static void builder_push_small_slice(struct matmul_bucket_data_s * mm, small_slice_t * S)
1590 {
1591 unsigned int ncols_t = mm->public_->dim[!mm->public_->store_transposed];
1592 if (S->is_small2) {
1593 slice_header_t hdr[1];
1594 memset(hdr, 0, sizeof(slice_header_t));
1595 hdr->t = SLICE_TYPE_SMALL2;
1596 hdr->i0 = S->i0;
1597 hdr->i1 = S->i1;
1598 hdr->j0 = 0;
1599 hdr->j1 = ncols_t;
1600 hdr->ncoeffs = S->ncoeffs;
1601 hdr->nchildren = 0;
1602
1603 /* small2 slices are smaller, and faster -- but more
1604 * restrictive. */
1605
        /* Because a small2 slice has data in 16-bit chunks, we'll
         * have to ensure proper alignment in the end. */
1608 ASSERT_ALWAYS((mm->t16.size() & (2-1)) == 0);
1609
1610 unsigned int j = 0;
1611 uint32_t lu_offset = 0;
1612 for( ; ! S->Lu.empty() ; S->Lu.pop_front(), lu_offset+=1<<16) {
1613 small_slice_t::Lv_t lv;
1614 swap(lv, S->Lu.front());
1615 typedef small_slice_t::Lvci_t Lvci_t;
1616 for(Lvci_t lvp = lv.begin() ; lvp != lv.end() ; ++lvp) {
1617 uint32_t dj = (lu_offset + lvp->first) - j;
1618 ASSERT_ALWAYS(dj < (1 << SMALL_SLICES_DJ_BITS) );
1619 ASSERT_ALWAYS(lvp->second < (1 << SMALL_SLICES_I_BITS) );
1620 mm->t16.push_back((dj << SMALL_SLICES_I_BITS) | lvp->second);
1621 j = lu_offset + lvp->first;
1622 }
1623 }
1624
1625 // align.
1626 for( ; (mm->t16.size() & (2-1)) ; )
1627 mm->t16.push_back(0);
1628
1629 mm->headers.push_back(*hdr);
1630 } else {
1631 /* How many vertical parts -- this merely has to do with index
1632 * wraparound, since we're processing data column-major anyway.
1633 */
1634 slice_header_t ehdr[1];
1635 vector<slice_header_t> hdrs;
1636 memset(ehdr, 0, sizeof(slice_header_t));
1637 ehdr->t = SLICE_TYPE_SMALL1;
1638 ehdr->i0 = S->i0;
1639 ehdr->i1 = S->i1;
1640 ehdr->j0 = 0;
1641 ehdr->j1 = ncols_t;
1642 ehdr->ncoeffs = S->ncoeffs;
1643
1644 unsigned int lu_offset = 0;
1645 for( ; ! S->Lu.empty() ; S->Lu.pop_front()) {
1646 small_slice_t::Lv_t lv;
1647 swap(lv, S->Lu.front());
1648 typedef small_slice_t::Lvci_t Lvci_t;
1649
1650 slice_header_t hdr[1];
1651 memset(hdr, 0, sizeof(slice_header_t));
1652 hdr->t = SLICE_TYPE_SMALL1_VBLOCK;
1653 hdr->i0 = S->i0;
1654 hdr->i1 = S->i1;
1655 hdr->j0 = lu_offset;
1656 hdr->nchildren = 0;
1657 hdr->ncoeffs = lv.size();
1658 for(Lvci_t lvp = lv.begin() ; lvp != lv.end() ; ++lvp) {
1659 mm->t16.push_back(lvp->first);
1660 mm->t16.push_back(lvp->second);
1661 }
1662 lu_offset += (1UL << 16);
1663 hdr->j1 = min(lu_offset, ncols_t);
1664 hdrs.push_back(*hdr);
1665 }
1666 safe_assign(ehdr->nchildren, hdrs.size());
1667 mm->headers.push_back(*ehdr);
1668 mm->headers.insert(mm->headers.end(), hdrs.begin(), hdrs.end());
1669 }
1670 mm->public_->ncoeffs += S->ncoeffs;
1671 }
1672
1673 /* }}} */
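
/* A worked example of the small2 packing done above (illustrative
 * only). Each coefficient is one 16-bit word holding
 * (dj << SMALL_SLICES_I_BITS) | i, the two bit widths summing to 16.
 * Assuming, for illustration, SMALL_SLICES_DJ_BITS = 4 and
 * SMALL_SLICES_I_BITS = 12, a coefficient at row offset i = 37 whose
 * column comes dj = 3 positions after the previous one packs as:
 *
 *      uint16_t w = (3u << 12) | 37u;          // 0x3025
 *      uint32_t dj = w >> 12;                  // 3
 *      uint32_t i  = w & ((1u << 12) - 1);     // 37
 *
 * which is exactly what matmul_sub_small2() undoes at multiply time.
 */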
1674
1675 /* {{{ large slices */
1676 static void builder_push_large_slice(struct matmul_bucket_data_s * mm, large_slice_t * L)
1677 {
1678 mm->headers.push_back(*L->hdr);
1679 for( ; ! L->vbl.empty() ; L->vbl.pop_front()) {
1680 large_slice_vblock_t & V(L->vbl.front());
1681 mm->aux.insert(mm->aux.end(), V.auxc.begin(), V.auxc.end());
1682 unsigned t8_size = V.t8c.size();
1683 mm->t8.insert(mm->t8.end(), V.t8c.begin(), V.t8c.end());
1684 // large slices, but not huge slices, may put an odd number
1685 // of coefficients in t8
1686 if (t8_size & 1) { mm->t8.push_back(0); }
1687 }
1688 mm->public_->ncoeffs += L->hdr->ncoeffs;
1689 }
1690
1691 /* }}} */
1692
1693 /* {{{ huge slices */
1694
1695 static void builder_push_huge_slice(struct matmul_bucket_data_s * mm, huge_slice_t * H)
1696 {
1697 mm->headers.push_back(*H->hdr);
1698 mm->aux.push_back(H->nlarge);
1699 for( ; ! H->vbl.empty() ; H->vbl.pop_front()) {
1700 large_slice_vblock_t & V(H->vbl.front());
1701 mm->aux.insert(mm->aux.end(), V.auxc.begin(), V.auxc.end());
1702 unsigned t8_size = V.t8c.size();
1703 ASSERT_ALWAYS((t8_size & 1) == 0);
1704 mm->t8.insert(mm->t8.end(), V.t8c.begin(), V.t8c.end());
1705 }
1706 mm->public_->ncoeffs += H->hdr->ncoeffs;
1707 }
1708 /* }}} */
1709
1710 /******************************************/
1711 /* Iteratively call the building routines */
1712
1713 /* {{{ small slices */
1714 static void builder_do_all_small_slices(builder * mb, uint32_t * p_i0, uint32_t imax, unsigned int npack)
1715 {
1716 /* npack is a guess for the expected size of small slices ; they are
1717 * arranged later to all have approximately equal size.
1718 */
1719 unsigned int s;
1720 uint32_t i00 = *p_i0;
1721 unsigned int nslices = iceildiv(imax - i00, npack);
1722 uint32_t done = 0;
1723 for(s = 0 ; s < nslices ; s++) {
1724 uint32_t i0 = i00 + s * (uint64_t) (imax - i00) / nslices;
1725 uint32_t i1 = i00 + (s+1) * (uint64_t) (imax - i00) / nslices;
1726
1727 small_slice_t S[1];
1728
1729 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1730 printf("Ssl%u: %ss %u+%u...", s, mb->rowname, i0, i1-i0);
1731 fflush(stdout);
1732 }
1733
1734 int keep = builder_do_small_slice(mb, S, i0, i1);
1735
1736 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1737 printf(" w %" PRIu32 " ; avg dj %.1f ; max dj %u%s\n",
1738 S->ncoeffs, S->dj_avg, S->dj_max,
1739 S->is_small2 ? " [packed]" : "");
1740 fflush(stdout);
1741 }
1742
1743 if (!keep) {
1744 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1745 "Switching to large slices. Ssl%u to be redone\n", s);
1746 break;
1747 }
1748 builder_push_small_slice(mb->mm, S);
1749 // transfer(Sq, S);
1750 done += i1 - i0;
1751 }
1752 *p_i0 += done;
1753 }
1754 /* }}} */
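
/* Sketch of the slice-bound arithmetic used above: rather than giving
 * every slice npack rows and leaving a short tail, the row range is
 * divided evenly, so slice sizes differ by at most one row. E.g. for
 * imax - i00 = 1000 rows and nslices = 3:
 *
 *      i0(s) = i00 + s     * (uint64_t) 1000 / 3;
 *      i1(s) = i00 + (s+1) * (uint64_t) 1000 / 3;
 *      // slices: [0,333) [333,666) [666,1000)
 *
 * The (uint64_t) cast guards the intermediate product against 32-bit
 * overflow. The same scheme is reused for large and huge slices below.
 */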
1755
1756 /* {{{ large slices */
1757 static void builder_do_all_large_slices(builder * mb, uint32_t * p_i0, unsigned int imax, unsigned int scratch1size)
1758 {
1759 unsigned int rem_nrows = imax - *p_i0;
1760 unsigned int nlarge_slices = iceildiv(rem_nrows, LSL_NBUCKETS_MAX * 256);
1761 uint32_t done = 0;
1762 for(unsigned int s = 0 ; s < nlarge_slices ; s++) {
1763 large_slice_t L[1];
1764 uint32_t i0 = * p_i0 + s * (uint64_t) rem_nrows / nlarge_slices;
1765 uint32_t i1 = * p_i0 + (s + 1) * (uint64_t) rem_nrows / nlarge_slices;
1766
1767 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1768 printf("Lsl%u %ss %u+%u", s, mb->rowname, i0, i1-i0);
1769 fflush(stdout);
1770 }
1771
1772 int keep = builder_do_large_slice(mb, L, i0, i1, imax, scratch1size);
1773
1774 if (!keep) {
1775 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1776 "Switching to huge slices. Lsl%u to be redone\n", s);
1777 break;
1778 }
1779
1780 // transfer(Lq, L);
1781 builder_push_large_slice(mb->mm, L);
1782 done += i1 - i0;
1783 }
1784 *p_i0 += done;
1785 }
1786 /* }}} */
1787
1788 /* {{{ huge slices */
1789 static void builder_do_all_huge_slices(builder * mb, uint32_t * p_i0, unsigned int imax, unsigned int scratch2size)
1790 {
1791 unsigned int rem_nrows = imax - *p_i0;
1792 unsigned int nhuge_slices = iceildiv(rem_nrows, HUGE_MPLEX_MAX * LSL_NBUCKETS_MAX * 256);
1793 uint32_t done = 0;
1794 for(unsigned int s = 0 ; s < nhuge_slices ; s++) {
1795 huge_slice_t H[1];
1796 uint32_t i0 = * p_i0 + s * (uint64_t) rem_nrows / nhuge_slices;
1797 uint32_t i1 = * p_i0 + (s + 1) * (uint64_t) rem_nrows / nhuge_slices;
1798
1799 if (verbose_enabled(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD)) {
1800 printf("Hsl%u %ss %u+%u", s, mb->rowname, i0, i1-i0);
1801 fflush(stdout);
1802 }
1803 builder_do_huge_slice(mb, H, i0, i1, scratch2size);
1804 builder_push_huge_slice(mb->mm, H);
1805 // transfer(Hq, H);
1806 done += i1 - i0;
1807 }
1808 *p_i0 += done;
1809 }
1810 /* }}} */
1811
1812 #define xxxCOMPRESS_COMBINERS_1
1813 #define xxxCOMPRESS_COMBINERS_2
1814 #define xxxCOMPRESS_COMBINERS_4
1815
1816 #if defined(COMPRESS_COMBINERS_1) || \
1817 defined(COMPRESS_COMBINERS_2) || \
1818 defined(COMPRESS_COMBINERS_4)
1819 #ifdef MATMUL_SUB_VSC_COMBINE_H_
1820 #error "Please either fix or disable the assembly code for COMPRESS_COMBINERS"
1821 /* Anyway this idea doesn't work so well, and makes little difference
1822  * to the running time. */
1823 #endif
1824 #endif
1825
1826
1827 /* {{{ staircase */
1828 static unsigned long compressed_size(unsigned long s, unsigned int defer MAYBE_UNUSED)
1829 {
1830 if (0) {
1831 #ifdef COMPRESS_COMBINERS_1
1832 } else if (defer == 1) {
1833 return iceildiv(s, 8);
1834 #endif
1835 #ifdef COMPRESS_COMBINERS_2
1836 } else if (defer <= 3) {
1837 return iceildiv(s, 4);
1838 #endif
1839 #ifdef COMPRESS_COMBINERS_4
1840 } else if (defer <= 15) {
1841 return iceildiv(s, 2);
1842 #endif
1843 }
1844 return s;
1845 }
1846
1847 static void append_compressed(vector<uint8_t>& t8, vector<uint8_t> const& S, unsigned int defer MAYBE_UNUSED)
1848 {
1849 if (0) {
1850 #ifdef COMPRESS_COMBINERS_1
1851 } else if (defer == 1) {
1852 const unsigned int nbits = 1;
1853 unsigned int c = S.size();
1854 t8.reserve(t8.size() + compressed_size(S.size(), defer));
1855 for(unsigned int i = 0 ; i < c ; i += 8 / nbits) {
1856 uint8_t x = 0;
1857 for(unsigned int j = 0 ; nbits * j < 8 && i + j < c ; j++) {
1858 x ^= S[i+j] << (nbits * j);
1859 }
1860 t8.push_back(x);
1861 }
1862 #endif
1863 #ifdef COMPRESS_COMBINERS_2
1864 } else if (defer <= 3) {
1865 const unsigned int nbits = 2;
1866 unsigned int c = S.size();
1867 t8.reserve(t8.size() + compressed_size(S.size(), defer));
1868 for(unsigned int i = 0 ; i < c ; i += 8 / nbits) {
1869 uint8_t x = 0;
1870 for(unsigned int j = 0 ; nbits * j < 8 && i + j < c ; j++) {
1871 x ^= S[i+j] << (nbits * j);
1872 }
1873 t8.push_back(x);
1874 }
1875 #endif
1876 #ifdef COMPRESS_COMBINERS_4
1877 } else if (defer <= 15) {
1878 const unsigned int nbits = 4;
1879 unsigned int c = S.size();
1880 t8.reserve(t8.size() + compressed_size(S.size(), defer));
1881 for(unsigned int i = 0 ; i < c ; i += 8 / nbits) {
1882 uint8_t x = 0;
1883 for(unsigned int j = 0 ; nbits * j < 8 && i + j < c ; j++) {
1884 x ^= S[i+j] << (nbits * j);
1885 }
1886 t8.push_back(x);
1887 }
1888 #endif
1889 } else {
1890 t8.insert(t8.end(), S.begin(), S.end());
1891 }
1892 }
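
/* Worked example for the bit-packing in append_compressed() (only
 * active when the COMPRESS_COMBINERS_* macros are enabled). With
 * nbits = 2, four source bytes fold into one output byte,
 * least-significant field first:
 *
 *      // S = { 1, 0, 1, 1 }
 *      uint8_t x = 0;
 *      x ^= 1 << 0;    // j = 0
 *      x ^= 0 << 2;    // j = 1
 *      x ^= 1 << 4;    // j = 2
 *      x ^= 1 << 6;    // j = 3
 *      // x == 0x51 ; matmul_sub_vsc_combine() peels the fields back
 *      // off with wx & ((1 << nbits) - 1) and wx >>= nbits.
 */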
1893
1894 static void builder_push_vsc_slices(struct matmul_bucket_data_s * mm, vsc_slice_t * V)
1895 {
1896 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
1897 "Flushing staircase slices\n");
1898
1899 mm->headers.push_back(*V->hdr);
1900
1901 mm->aux.push_back(V->dispatch.size());
1902 mm->aux.push_back(V->steps.size());
1903
1904 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1905 mm->aux.push_back(V->steps[l].defer);
1906 mm->aux.push_back(V->steps[l].tbuf_space);
1907 }
1908
1909 /* all headers */
1910 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
1911 mm->headers.push_back(*V->dispatch[k].hdr);
1912 }
1913 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
1914 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1915 mm->headers.push_back(*V->dispatch[k].sub[l].hdr);
1916 }
1917 }
1918
1919 /* We also add header information for the combination operations,
1920  * even though for convenience the payload is stored alongside the
1921  * dispatching data. We do need the combination headers in order to
1922  * keep proper track of where the time is eventually spent.
1923  *
1924  * The combining headers are stored en route while flushing
1925  * combination points.
1926  */
1927
1928 unsigned int cidx = mm->headers.size();
1929
1930 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1931 slice_header_t chdr[1];
1932 memset(chdr, 0, sizeof(slice_header_t));
1933 chdr->t = SLICE_TYPE_DEFER_ROW;
1934 chdr->i0 = V->dispatch[0].sub[l].hdr->i0;
1935 chdr->i1 = V->dispatch[0].sub[l].hdr->i1;
1936 chdr->j0 = V->hdr->j0;
1937 chdr->j1 = V->hdr->j1;
1938 chdr->nchildren = 0;
1939 chdr->ncoeffs = 0;
1940 mm->headers.push_back(*chdr);
1941 }
1942
1943 vector<pair<pair<unsigned int, unsigned int>, pair<uint64_t, uint64_t> > > csizes;
1944
1945 // unsigned int o8 = mm->t8.size();
1946
1947 /* dispatch data goes in dispatching order */
1948 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
1949 vsc_middle_slice_t& D(V->dispatch[k]);
1950
1951 /* The stream of u16 values corresponding to this strip */
1952 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1953 vsc_sub_slice_t & S(D.sub[l]);
1954 // printf("save dispatch(%zu), sum=%x\n", S.x.size(), idiotic_sum((void*) ptrbegin(S.x), S.x.size() * sizeof(uint16_t)));
1955 mm->t16.insert(mm->t16.end(), S.x.begin(), S.x.end());
1956 S.x.clear();
1957 }
1958 /* Now the combining data */
1959 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
1960 if (!flush_here(k, V->dispatch.size(), V->steps[l].defer))
1961 continue;
1962 unsigned int k0 = k - (k % V->steps[l].defer);
1963 /* Think about the combining header ! */
1964 slice_header_t chdr[1];
1965 memset(chdr, 0, sizeof(slice_header_t));
1966 chdr->t = SLICE_TYPE_DEFER_CMB;
1967 chdr->i0 = V->dispatch[k0].sub[l].hdr->i0;
1968 chdr->i1 = V->dispatch[k0].sub[l].hdr->i1;
1969 chdr->j0 = V->dispatch[k0].sub[l].hdr->j0;
1970 chdr->j1 = V->dispatch[k].sub[l].hdr->j1;
1971 chdr->nchildren = 0;
1972 chdr->ncoeffs = 0;
1973 /* flush this one */
1974 vsc_sub_slice_t & S(V->dispatch[k].sub[l]);
1975 for( ; k0 <= k ; k0++) {
1976 chdr->ncoeffs += V->dispatch[k0].sub[l].hdr->ncoeffs;
1977 }
1978 unsigned int defer = V->steps[l].defer;
1979
1980 ASSERT_ALWAYS(S.c.size() == chdr->ncoeffs + V->steps[l].nrows);
1981
1982 csizes.push_back(make_pair(
1983 make_pair(k - (k % defer), l),
1984 make_pair(0, S.c.size())));
1985 mm->headers[cidx + l].ncoeffs += chdr->ncoeffs;
1986 // printf("save combine(%zu), sum=%x\n", S.c.size(), idiotic_sum((void*) ptrbegin(S.c), S.c.size()));
1987 // printf("m=%u\n", *max_element(S.c.begin(), S.c.end()));
1988 append_compressed(mm->t8, S.c, defer);
1989 S.c.clear();
1990 mm->headers.push_back(*chdr);
1991 }
1992 }
1993
1994 /* This fixes the reading order w.r.t. the positioning and size of
1995 * the combining slices */
1996 uint64_t p8 = 0;
1997 for(unsigned int i = 0 ; i < csizes.size() ; i++) {
1998 // unsigned int k = csizes[i].first.first;
1999 unsigned int l = csizes[i].first.second;
2000 unsigned long s = csizes[i].second.second;
2001 unsigned int defer = V->steps[l].defer;
2002 unsigned long count = compressed_size(s, defer);
2003 /*
2004 printf("straight-order (defer %u, vstrip #%u)"
2005 ": @%" PRIu64 " combine(%" PRIu64 "), sum=%x\n",
2006 V->steps[csizes[i].first.second].defer, csizes[i].first.first,
2007 csizes[i].second.first, csizes[i].second.second,
2008 idiotic_sum((void*) &*(mm->t8.begin() + o8 + csizes[i].second.first), csizes[i].second.second));
2009 */
2010 csizes[i].second.first = p8;
2011 p8 += count;
2012 }
2013 sort(csizes.begin(), csizes.end());
2014 mm->aux.push_back(2 * csizes.size() + 1);
2015 for(unsigned int i = 0 ; i < csizes.size() ; i++) {
2016 /*
2017 printf("straight-order (defer %u, vstrip #%u)"
2018 ": @%" PRIu64 " combine(%" PRIu64 "), sum=%x\n",
2019 V->steps[csizes[i].first.second].defer, csizes[i].first.first,
2020 csizes[i].second.first, csizes[i].second.second,
2021 idiotic_sum((void*) &*(mm->t8.begin() + o8 + csizes[i].second.first), csizes[i].second.second));
2022 */
2023 mm->aux.push_back(csizes[i].second.first);
2024 mm->aux.push_back(csizes[i].second.second);
2025 }
2026 mm->aux.push_back(p8);
2027
2028 mm->public_->ncoeffs += V->hdr->ncoeffs;
2029 }
2030 /* }}} */
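
/* Bookkeeping sketch for csizes above (illustrative). Combining blocks
 * are written to t8 in flush order, but the transposed multiply reads
 * them keyed by (first vstrip k0, step l). So we first accumulate byte
 * offsets in write order, then sort by key, and emit into aux:
 *
 *      one count word : 2 * csizes.size() + 1
 *      per block      : byte offset into t8, element count
 *      finally        : p8, total byte size of all combining blocks
 *
 * The straight multiply skips this table wholesale (the ``skipover''
 * read in matmul_bucket_mul_vsc), while the transposed one uses the
 * offsets to locate each block out of order.
 */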
2031
2032 void MATMUL_NAME(build_cache)(matmul_ptr mm0, uint32_t * data, size_t size MAYBE_UNUSED)
2033 {
2034 struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
2035 builder mb[1];
2036 builder_init(mb, mm, data);
2037
2038 verbose_printf(CADO_VERBOSE_PRINT_BWC_CACHE_BUILD,
2039 "%u rows %u cols\n", mm->public_->dim[0], mm->public_->dim[1]);
2040
2041 uint32_t main_i0 = 0;
2042 uint32_t fence = mb->nrows_t;
2043 if (mm->methods.small1 || mm->methods.small2)
2044 builder_do_all_small_slices(mb, &main_i0, fence, mm->npack);
2045
2046 if (mm->methods.large)
2047 builder_do_all_large_slices(mb, &main_i0, fence, mm->scratch1size);
2048
2049 /* Note that vsc and huge are exclusive ! */
2050 if (mm->methods.vsc && main_i0 < fence) {
2051 vsc_slice_t V[1];
2052 builder_prepare_vsc_slices(mb, V, main_i0, fence);
2053 mm->scratch3size = MAX(mm->scratch3size, V->tbuf_space);
2054 builder_push_vsc_slices(mm, V);
2055 main_i0 = fence;
2056 }
2057 if (mm->methods.huge && main_i0 < fence) {
2058 builder_do_all_huge_slices(mb, &main_i0, fence, mm->scratch2size);
2059 }
2060 if (main_i0 < fence) {
2061 fprintf(stderr, "ARGH ! only created a submatrix (%" PRIu32 " < %" PRIu32 ") !!\n", main_i0, fence);
2062 exit(1);
2063 }
2064
2065
2066 /* done, at last ! */
2067
2068 builder_clear(mb);
2069
2070 mm_finish_init(mm);
2071 }
2072
2073 int MATMUL_NAME(reload_cache)(matmul_ptr mm0)/* {{{ */
2074 {
2075 struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
2076 FILE * f;
2077
2078 f = matmul_common_reload_cache_fopen(sizeof(abelt), mm->public_, MM_MAGIC);
2079 if (!f) return 0;
2080
2081 for( ;; ) {
2082 slice_header_t hdr[1];
2083 MATMUL_COMMON_READ_ONE16(hdr->t, f);
2084 MATMUL_COMMON_READ_ONE16(hdr->nchildren, f);
2085 MATMUL_COMMON_READ_MANY8(hdr->pad, 4, f);
2086 MATMUL_COMMON_READ_ONE32(hdr->i0, f);
2087 MATMUL_COMMON_READ_ONE32(hdr->i1, f);
2088 MATMUL_COMMON_READ_ONE32(hdr->j0, f);
2089 MATMUL_COMMON_READ_ONE32(hdr->j1, f);
2090 MATMUL_COMMON_READ_ONE64(hdr->ncoeffs, f);
2091 if (hdr->t == SLICE_TYPE_NONE)
2092 break;
2093 mm->headers.push_back(*hdr);
2094 }
2095
2096 MATMUL_COMMON_READ_ONE32(mm->scratch1size, f);
2097 MATMUL_COMMON_READ_ONE32(mm->scratch2size, f);
2098 MATMUL_COMMON_READ_ONE32(mm->scratch3size, f);
2099
2100 size_t n16, n8, naux;
2101 MATMUL_COMMON_READ_ONE32(n16, f);
2102 MATMUL_COMMON_READ_ONE32(n8, f);
2103 MATMUL_COMMON_READ_ONE32(naux, f);
2104 mm->t16.resize(n16);
2105 mm->t8.resize(n8);
2106 mm->aux.resize(naux);
2107 MATMUL_COMMON_READ_MANY16(ptrbegin(mm->t16), n16, f);
2108 MATMUL_COMMON_READ_MANY8(ptrbegin(mm->t8), n8, f);
2109 MATMUL_COMMON_READ_MANY32(ptrbegin(mm->aux), naux, f);
2110
2111 fclose(f);
2112
2113 mm_finish_init(mm);
2114
2115 return 1;
2116 }/*}}}*/
2117
2118 void MATMUL_NAME(save_cache)(matmul_ptr mm0)/*{{{*/
2119 {
2120 struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
2121 FILE * f;
2122
2123 f = matmul_common_save_cache_fopen(sizeof(abelt), mm->public_, MM_MAGIC);
2124 if (!f) return;
2125
2126 for(unsigned int h = 0 ; h < mm->headers.size() ; h++) {
2127 slice_header_t * hdr = & (mm->headers[h]);
2128 MATMUL_COMMON_WRITE_ONE16(hdr->t, f);
2129 MATMUL_COMMON_WRITE_ONE16(hdr->nchildren, f);
2130 MATMUL_COMMON_WRITE_MANY8(hdr->pad, 4, f);
2131 MATMUL_COMMON_WRITE_ONE32(hdr->i0, f);
2132 MATMUL_COMMON_WRITE_ONE32(hdr->i1, f);
2133 MATMUL_COMMON_WRITE_ONE32(hdr->j0, f);
2134 MATMUL_COMMON_WRITE_ONE32(hdr->j1, f);
2135 MATMUL_COMMON_WRITE_ONE64(hdr->ncoeffs, f);
2136 }
2137 /* terminating sentinel: one all-zero header (SLICE_TYPE_NONE) */
2138 MATMUL_COMMON_WRITE_ONE16(0, f);
2139 MATMUL_COMMON_WRITE_ONE16(0, f);
2140 MATMUL_COMMON_WRITE_ONE32(0, f); // the 4 pad bytes
2141 MATMUL_COMMON_WRITE_ONE32(0, f);
2142 MATMUL_COMMON_WRITE_ONE32(0, f);
2143 MATMUL_COMMON_WRITE_ONE32(0, f);
2144 MATMUL_COMMON_WRITE_ONE32(0, f);
2145 MATMUL_COMMON_WRITE_ONE64(0, f);
2146
2147 MATMUL_COMMON_WRITE_ONE32(mm->scratch1size, f);
2148 MATMUL_COMMON_WRITE_ONE32(mm->scratch2size, f);
2149 MATMUL_COMMON_WRITE_ONE32(mm->scratch3size, f);
2150 size_t n16 = mm->t16.size();
2151 size_t n8 = mm->t8.size();
2152 size_t naux = mm->aux.size();
2153 MATMUL_COMMON_WRITE_ONE32(n16, f);
2154 MATMUL_COMMON_WRITE_ONE32(n8, f);
2155 MATMUL_COMMON_WRITE_ONE32(naux, f);
2156 MATMUL_COMMON_WRITE_MANY16(ptrbegin(mm->t16), n16, f);
2157 MATMUL_COMMON_WRITE_MANY8(ptrbegin(mm->t8), n8, f);
2158 MATMUL_COMMON_WRITE_MANY32(ptrbegin(mm->aux), naux, f);
2159
2160 fclose(f);
2161 }/*}}}*/
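
/* On-disk cache layout written above and read back by reload_cache()
 * (sketch ; field widths as in the code):
 *
 *      per header : u16 t, u16 nchildren, 4 x u8 pad,
 *                   u32 i0, i1, j0, j1, u64 ncoeffs
 *      sentinel   : one all-zero header (t == SLICE_TYPE_NONE)
 *      then       : u32 scratch1size, scratch2size, scratch3size,
 *                   u32 n16, n8, naux,
 *                   n16 x u16 (t16), n8 x u8 (t8), naux x u32 (aux)
 *
 * reload_cache() stops its header loop on the sentinel, so the two
 * functions must stay in sync field for field.
 */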
2162
2163 // static inline uint32_t read32(uint16_t const * & q) /* {{{ */
2164 // {
2165 // uint32_t res;
2166 // res = *q++;
2167 // res |= ((uint32_t) *q++) << 16;
2168 // return res;
2169 // } /* }}} */
2170
2171 #ifndef MATMUL_SUB_SMALL1_H_
2172 static const uint16_t * matmul_sub_small1(abdst_field x, abelt * where, abelt const * from, const uint16_t * q, unsigned int count)
2173 {
2174 for(uint32_t c = 0 ; c < count ; c++) {
2175 uint32_t j = *q++;
2176 uint32_t di = *q++;
2177 // ASSERT(j < (1UL << 16));
2178 // ASSERT(hdr->j0 + j < hdr->j1);
2179 // ASSERT(hdr->i0 + di < hdr->i1);
2180 abadd(x, where[di], where[di], from[j]);
2181 }
2182 return q;
2183 }
2184 #endif
2185
2186 #ifndef MATMUL_SUB_SMALL1_TR_H_
2187 static const uint16_t * matmul_sub_small1_tr(abdst_field x, abelt * where, abelt const * from, const uint16_t * q, unsigned int count)
2188 {
2189 for(uint32_t c = 0 ; c < count ; c++) {
2190 uint32_t j = *q++;
2191 uint32_t di = *q++;
2192 // ASSERT(j < (1UL << 16));
2193 // ASSERT(hdr->j0 + j < hdr->j1);
2194 // ASSERT(hdr->i0 + di < hdr->i1);
2195 abadd(x, where[j], where[j], from[di]);
2196 }
2197 return q;
2198 }
2199 #endif
2200
2201 #ifndef MATMUL_SUB_SMALL2_H_
2202 static const uint16_t * matmul_sub_small2(abdst_field x, abelt * where, abelt const * from, const uint16_t * q, unsigned int count)
2203 {
2204 uint32_t j = 0;
2205 for(uint32_t c = 0 ; c < count ; c++) {
2206 uint16_t h = *q++;
2207 j += h >> SMALL_SLICES_I_BITS;
2208 // ASSERT(j < pos->ncols_t);
2209 uint32_t di = h & ((1UL << SMALL_SLICES_I_BITS) - 1);
2210 // ASSERT(hdr->j0 + j < hdr->j1);
2211 // ASSERT(hdr->i0 + di < hdr->i1);
2212 abadd(x, where[di], where[di], from[j]);
2213 }
2214 return q;
2215 }
2216 #endif
2217
2218 #ifndef MATMUL_SUB_SMALL2_TR_H_
2219 static const uint16_t * matmul_sub_small2_tr(abdst_field x, abelt * where, abelt const * from, const uint16_t * q, unsigned int count)
2220 {
2221 uint32_t j = 0;
2222 for(uint32_t c = 0 ; c < count ; c++) {
2223 uint16_t h = *q++;
2224 j += h >> SMALL_SLICES_I_BITS;
2225 // ASSERT(j < pos->ncols_t);
2226 uint32_t di = h & ((1UL << SMALL_SLICES_I_BITS) - 1);
2227 // ASSERT(hdr->j0 + j < hdr->j1);
2228 // ASSERT(hdr->i0 + di < hdr->i1);
2229 abadd(x, where[j], where[j], from[di]);
2230 }
2231 return q;
2232 }
2233 #endif
2234
2235 struct pos_desc {
2236 const uint16_t * q16;
2237 const uint8_t * q8;
2238 const unsigned int * ql;
2239 uint32_t i;
2240 uint32_t nrows_t;
2241 uint32_t ncols_t;
2242 };
2243
2244 static inline void matmul_bucket_mul_small1_vblock(struct matmul_bucket_data_s * mm, slice_header_t * hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2245 {
2246 ASM_COMMENT("multiplication code -- small1 (dense) slices"); /* {{{ */
2247 int usual = d == ! mm->public_->store_transposed;
2248 abdst_field x = mm->xab;
2249 abelt * where = dst + (usual ? hdr->i0 : hdr->j0);
2250 abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2251 if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2252 abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2253 }
2254
2255 uint32_t ncoeffs_slice = hdr->ncoeffs;
2256 ASSERT_ALWAYS(pos->i == hdr->i0); // FIXME -- should disappear.
2257
2258 if (usual) {
2259 pos->q16 = matmul_sub_small1(x, where, from, pos->q16, ncoeffs_slice);
2260 } else {
2261 pos->q16 = matmul_sub_small1_tr(x, where, from, pos->q16, ncoeffs_slice);
2262 }
2263 ASM_COMMENT("end of small1 (dense) slices"); /* }}} */
2264 }
2265
2266 static inline void matmul_bucket_mul_small2(struct matmul_bucket_data_s * mm, slice_header_t * hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2267 {
2268 ASM_COMMENT("multiplication code -- small2 (dense) slices"); /* {{{ */
2269 int usual = d == ! mm->public_->store_transposed;
2270 abdst_field x = mm->xab;
2271 abelt * where = dst + (usual ? hdr->i0 : hdr->j0);
2272 abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2273
2274 if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2275 abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2276 }
2277
2278 uint32_t ncoeffs_slice = hdr->ncoeffs;
2279 ASSERT_ALWAYS(pos->i == hdr->i0); // FIXME -- should disappear.
2280
2281 if (usual) {
2282 pos->q16 = matmul_sub_small2(x, where, from, pos->q16, ncoeffs_slice);
2283 } else {
2284 pos->q16 = matmul_sub_small2_tr(x, where, from, pos->q16, ncoeffs_slice);
2285 }
2286 /* fix alignment in any case */
2287 pos->q16 += (2 - 1) & - ncoeffs_slice;
2288 ASM_COMMENT("end of small2 (dense) slices"); /* }}} */
2289 }
2290
2291 // static inline void prepare_buckets_and_fences(abdst_field x MAYBE_UNUSED, abelt ** b, abelt ** f, abelt * z, const unsigned int * ql, unsigned int n)
2292 // {
2293 // for(unsigned int k = 0 ; k < n ; k++) {
2294 // b[k] = z;
2295 // f[k] = b[k] + ql[k];
2296 // z += ql[k];
2297 // }
2298 // }
2299
2300 static inline void prepare_buckets(abdst_field x MAYBE_UNUSED, abelt ** b, abelt * z, const unsigned int * ql, unsigned int n)
2301 {
2302 for(unsigned int k = 0 ; k < n ; k++) {
2303 b[k] = z;
2304 z += ql[k];
2305 }
2306 }
2307
2308 #ifndef MATMUL_SUB_LARGE_FBI_H_
2309 static inline void matmul_sub_large_fbi(abdst_field x, abelt ** sb, const abelt * z, const uint8_t * q, unsigned int n)
2310 {
2311 /* Dispatch data found in z[0]...z[f(n-1)] such that z[f(i)] goes to
2312  * the array pointed to by sb[q[2*i+1]], where f(i) is the sum
2313  * q[0]+q[2]+...+q[2*(i-1)]. Exactly 2n bytes are expected in q[].
2314  * All the sb[] pointers are advanced. */
2315 for(unsigned int c = 0 ; c < n ; c++) {
2316 z += *q;
2317 // we might receive zmax and do some checking (see caller)
2318 // ASSERT_ALWAYS(z < zmax);
2319 q++;
2320 abset(x, sb[*q][0], z[0]);
2321 sb[*q]+= 1;
2322 q++;
2323 }
2324 }
2325 #endif
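
/* Illustrative decoding of the fbi byte stream consumed above: bytes
 * come in (skip, bucket) pairs.
 *
 *      const uint8_t q[] = { 3, 5,  0, 2 };   // n = 2
 *      // c = 0 : z += 3 ; append z[0] to bucket sb[5]
 *      // c = 1 : z += 0 ; append z[0] to bucket sb[2]
 *
 * Each of the n coefficients thus costs exactly two bytes here, which
 * is why the callers advance pos->q8 by 2*n past this stream.
 */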
2326
2327 #ifndef MATMUL_SUB_LARGE_FBI_TR_H_
2328 static inline void
2329 matmul_sub_large_fbi_tr(abdst_field x, abelt ** sb, abelt * z, const uint8_t * q, unsigned int n)
2330 {
2331 /* Does the converse of the above */
2332 for(unsigned int c = 0 ; c < n ; c++) {
2333 z += *q;
2334 q++;
2335 abadd(x, z[0], z[0], sb[*q][0]);
2336 sb[*q]+= 1;
2337 q++;
2338 }
2339 }
2340 #endif
2341
2342 #ifndef MATMUL_SUB_LARGE_ASB_H_
2343 // static void matmul_sub_large_asb(abdst_field x, abelt * dst, const abelt * z, const uint8_t * q, const unsigned int * ql) __attribute__((__noinline__));
2344 static void matmul_sub_large_asb(abdst_field x, abelt * dst, const abelt * z, const uint8_t * q, const unsigned int * ql)
2345 {
2346 /* This ``applies'' the LSL_NBUCKETS_MAX small buckets, whose
2347  * respective lengths are given by ql[0] to ql[LSL_NBUCKETS_MAX-1].
2348  *
2349  * Writing qk = q + ql[0] + ... + ql[k-1], and likewise zk for z,
2350  * bucket k adds zk[i] to dst[k*256 + qk[i]] for 0 <= i < ql[k].
2351  */
2352 for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
2353 unsigned int l = ql[k];
2354 for( ; l-- ; ) {
2355 /* For padding coeffs, the assertion can fail if
2356 * we choose a row not equal to (0,0) -- first in
2357 * the first bucket.
2358 */
2359 abadd(x, dst[*q], dst[*q], z[0]);
2360 z+= 1;
2361 q++;
2362 }
2363 dst += 256;
2364 }
2365 }
2366 #endif
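
/* Sketch of the index arithmetic above: bucket k covers rows
 * [k*256, (k+1)*256) of the slice, and each byte of q is a local 8-bit
 * row index, so
 *
 *      dst_row = k * 256 + q[i];
 *
 * With LSL_NBUCKETS_MAX buckets this spans the LSL_NBUCKETS_MAX*256
 * rows a large slice may hold ; ql[k] tells how many scratch elements
 * land in bucket k.
 */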
2367
2368 #ifndef MATMUL_SUB_LARGE_ASB_TR_H_
2369 static inline void matmul_sub_large_asb_tr(abdst_field x, const abelt * src, abelt * z, const uint8_t * q, const unsigned int * ql)
2370 {
2371 /* converse of the above */
2372 for(int k = 0 ; k < LSL_NBUCKETS_MAX ; k++) {
2373 unsigned int l = ql[k];
2374 for( ; l-- ; ) {
2375 /* For padding coeffs, the assertion can fail if
2376 * we choose a row not equal to (0,0) -- first in
2377 * the first bucket.
2378 */
2379 abset(x, z[0], src[*q]);
2380 z += 1;
2381 q++;
2382 }
2383 src += 256;
2384 }
2385 }
2386 #endif
2387
2388 #ifndef MATMUL_SUB_LARGE_FBD_H_
2389 static void
2390 matmul_sub_large_fbd(abdst_field x, abelt ** sb, const abelt * z, const uint8_t * q, unsigned int n)
2391 {
2392 /* Dispatch data found in z[0]...z[n-1] such that z[i] goes to the
2393  * array pointed to by sb[q[i]]. Exactly n coefficients are expected.
2394  * All the sb[] pointers are advanced. */
2395 for(unsigned int c = 0 ; c < n ; c++) {
2396 abset(x, sb[q[c]][0], z[0]);
2397 sb[q[c]]+= 1;
2398 z += 1;
2399 }
2400 }
2401 #endif
2402
2403 #ifndef MATMUL_SUB_LARGE_FBD_TR_H_
2404 static inline void
2405 matmul_sub_large_fbd_tr(abdst_field x, abelt ** sb, abelt * z, const uint8_t * q, unsigned int n)
2406 {
2407 /* Does the converse of the above */
2408 for(unsigned int c = 0 ; c < n ; c++) {
2409 abset(x, z[0], sb[q[c]][0]);
2410 sb[q[c]]+= 1;
2411 z += 1;
2412 }
2413 }
2414 #endif
2415
2416 static inline void matmul_bucket_mul_large(struct matmul_bucket_data_s * mm, slice_header_t * hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2417 {
2418 ASM_COMMENT("multiplication code -- large (sparse) slices"); /* {{{ */
2419
2420 abdst_field x = mm->xab;
2421
2422 int usual = d == ! mm->public_->store_transposed;
2423
2424 abelt * where = dst + (usual ? hdr->i0 : hdr->j0);
2425 abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2426 if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2427 abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2428 }
2429
2430 abelt * scratch = mm->scratch1;
2431
2432 uint32_t j = 0;
2433
2434 if (usual) {
2435 for( ; j < pos->ncols_t ; ) {
2436 uint32_t j1 = j + *pos->ql++;
2437 uint32_t n = *pos->ql++;
2438 abelt * bucket[LSL_NBUCKETS_MAX];
2439 abelt const * inp = from + j;
2440 prepare_buckets(x,bucket,scratch,pos->ql,LSL_NBUCKETS_MAX);
2441 ASSERT_ALWAYS((((unsigned long)pos->q8)&1)==0);
2442 matmul_sub_large_fbi(x, bucket, inp, pos->q8, n);
2443 // the (inp) variable in the call above never exceeds from +
2444 // j1, although we don't do the check in reality.
2445 // matmul_sub_large_fbi_boundschecked(x, bucket, inp, pos->q8, n, from + j1);
2446 matmul_sub_large_asb(x, where, scratch, pos->q8+2*n, pos->ql);
2447 pos->q8 += 3*n;
2448 // fix alignment !
2449 pos->q8 += n & 1;
2450 /* FIXME. LSL_NBUCKETS_MAX is a compile-time constant, but it
2451 * might differ in the computed cache file !!! (occurs here
2452 * and in several other places as well) */
2453 pos->ql += LSL_NBUCKETS_MAX;
2454 j = j1;
2455 }
2456 } else {
2457 for( ; j < pos->ncols_t ; ) {
2458 uint32_t j1 = j + *pos->ql++;
2459 uint32_t n = *pos->ql++;
2460 abelt * bucket[LSL_NBUCKETS_MAX];
2461 abelt * outp = where + j;
2462 prepare_buckets(x,bucket,scratch,pos->ql,LSL_NBUCKETS_MAX);
2463 matmul_sub_large_asb_tr(x, from, scratch, pos->q8+2*n, pos->ql);
2464 ASSERT_ALWAYS((((unsigned long)pos->q8)&1)==0);
2465 matmul_sub_large_fbi_tr(x, bucket, outp, pos->q8, n);
2466 pos->q8 += 3 * n;
2467 // fix alignment !
2468 pos->q8 += n & 1;
2469 pos->ql += LSL_NBUCKETS_MAX;
2470 j = j1;
2471 }
2472 }
2473 ASM_COMMENT("end of large (sparse) slices"); /* }}} */
2474 }
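
/* Per-vblock stream layout consumed by the loop above (sketch):
 *
 *      ql : jwidth, n, then LSL_NBUCKETS_MAX bucket sizes
 *      q8 : 2*n bytes for fbi (skip,bucket pairs),
 *           n bytes for asb (local row indices),
 *           plus one pad byte when n is odd
 *
 * hence the pos->q8 += 3*n + (n & 1) advance. The transposed branch
 * walks the same data, applying asb_tr before fbi_tr so that reads and
 * writes swap roles.
 */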
2475
2476 static inline void matmul_bucket_mul_huge(struct matmul_bucket_data_s * mm, slice_header_t * hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2477 {
2478 ASM_COMMENT("multiplication code -- huge (very sparse) slices"); /* {{{ */
2479
2480 abdst_field x = mm->xab;
2481
2482 int usual = d == ! mm->public_->store_transposed;
2483
2484 abelt * where = dst + (usual ? hdr->i0 : hdr->j0);
2485 abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2486 if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2487 abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2488 }
2489
2490 abelt * scratch = mm->scratch1;
2491
2492 uint32_t j = 0;
2493
2494 uint32_t di = hdr->i1 - hdr->i0;
2495 unsigned int nlarge = *pos->ql++;
2496 uint32_t di_sub = iceildiv(di, nlarge);
2497
2498 if (usual) {
2499 /* ok -- hold your breath a second. */
2500 for( ; j < pos->ncols_t ; ) {
2501 uint32_t j1 = j + *pos->ql++;
2502 unsigned int n = *pos->ql++;
2503 ASSERT_ALWAYS(n <= mm->scratch2size);
2504 abelt * scratch2 = mm->scratch2;
2505 abelt const * inp = src + j;
2506 abelt * bucket[HUGE_MPLEX_MAX];
2507 const unsigned int * Lsizes = pos->ql;
2508 prepare_buckets(x,bucket,scratch2,pos->ql,nlarge);
2509 pos->ql += nlarge;
2510 ASSERT_ALWAYS((((unsigned long)pos->q8)&1)==0);
2511 matmul_sub_large_fbi(x, bucket, inp, pos->q8, n);
2512 pos->q8 += 2 * n;
2513 for(unsigned int k = 0 ; k < nlarge ; k++) {
2514 abelt * sbucket[LSL_NBUCKETS_MAX];
2515 prepare_buckets(x,sbucket,scratch,pos->ql,LSL_NBUCKETS_MAX);
2516 bucket[k] -= Lsizes[k];
2517 matmul_sub_large_fbd(x, sbucket, bucket[k], pos->q8, Lsizes[k]);
2518 pos->q8 += Lsizes[k];
2519 abelt * outp = where + k * di_sub;
2520 matmul_sub_large_asb(x, outp, scratch, pos->q8, pos->ql);
2521 pos->q8 += Lsizes[k];
2522 pos->ql += LSL_NBUCKETS_MAX;
2523 }
2524 j = j1;
2525 }
2526 } else {
2527 for( ; j < pos->ncols_t ; ) {
2528 uint32_t j1 = j + *pos->ql++;
2529 unsigned int n = *pos->ql++;
2530 abelt * scratch2 = mm->scratch2;
2531 abelt * outp = dst + j;
2532 abelt * bucket[HUGE_MPLEX_MAX];
2533 const unsigned int * Lsizes = pos->ql;
2534 prepare_buckets(x,bucket,scratch2,pos->ql,nlarge);
2535 pos->ql += nlarge;
2536 const uint8_t * q8_saved = pos->q8;
2537 pos->q8 += 2 * n;
2538 for(unsigned int k = 0 ; k < nlarge ; k++) {
2539 abelt * sbucket[LSL_NBUCKETS_MAX];
2540 prepare_buckets(x,sbucket,scratch,pos->ql,LSL_NBUCKETS_MAX);
2541 const uint8_t * fill = pos->q8;
2542 const uint8_t * apply = pos->q8 + Lsizes[k];
2543 const abelt * inp = from + k * di_sub;
2544 matmul_sub_large_asb_tr(x, inp, scratch, apply, pos->ql);
2545
2546 matmul_sub_large_fbd_tr(x, sbucket, bucket[k], fill, Lsizes[k]);
2547
2548 pos->q8 += 2 * Lsizes[k];
2549 pos->ql += LSL_NBUCKETS_MAX;
2550 }
2551 swap(pos->q8, q8_saved);
2552 ASSERT_ALWAYS((((unsigned long)pos->q8)&1)==0);
2553 matmul_sub_large_fbi_tr(x, bucket, outp, pos->q8, n);
2554 pos->q8 += 2 * n;
2555 swap(pos->q8, q8_saved);
2556 j = j1;
2557 }
2558 }
2559 ASM_COMMENT("end of huge (very sparse) slices"); /* }}} */
2560 }
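
/* Same idea for huge slices (sketch): each vblock carries
 *
 *      ql : jwidth, n, the nlarge sub-slice sizes Lsizes[],
 *           then nlarge * LSL_NBUCKETS_MAX small-bucket sizes
 *      q8 : 2*n bytes for the first-level fbi, then, per sub-slice k,
 *           Lsizes[k] bytes for fbd and Lsizes[k] bytes for asb
 *
 * Since each coefficient lands in exactly one large bucket, the
 * Lsizes[] sum to n, i.e. 2*n + 2*n = 4*n bytes in total -- the
 * pos->q8 += 4*n skip used by mm_finish_init() for this slice type.
 */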
2561
2562 #ifndef MATMUL_SUB_VSC_DISPATCH_H
2563 static inline void matmul_sub_vsc_dispatch(abdst_field x, abelt * dst, abelt const * src, const uint16_t * q, unsigned int count)
2564 {
2565 // printf("dispatch(%u), sum=%x\n", count, idiotic_sum((void*)q, count * sizeof(uint16_t)));
2566 for( ; count-- ; ) {
2567 abset(x, dst[0], src[*q++]);
2568 dst += 1;
2569 }
2570 }
2571 #endif
2572
2573 #ifndef MATMUL_SUB_VSC_COMBINE_H_
2574 static inline void matmul_sub_vsc_combine(abdst_field x, abelt * dst, const abelt * * mptrs, const uint8_t * q, unsigned int count, unsigned int defer MAYBE_UNUSED)
2575 {
2576 // printf("combine(%u), defer %u\n", count, defer);
2577 // printf("combine(%u), sum=%x\n", count, idiotic_sum((void*)q, compressed_size(count, defer)));
2578 if (0) {
2579 #ifdef COMPRESS_COMBINERS_1
2580 } else if (defer == 1) {
2581 const unsigned int nbits = 1;
2582 for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2583 uint8_t wx = *q++;
2584 for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2585 uint8_t c = wx & ((1 << nbits) - 1);
2586 ASSERT(c <= defer);
2587 wx >>= nbits;
2588 abadd(x, dst[0], dst[0], mptrs[c][0]);
2589 mptrs[c] += c != 0;
2590 dst += c == 0;
2591 }
2592 }
2593 #endif
2594 #ifdef COMPRESS_COMBINERS_2
2595 } else if (defer <= 3) {
2596 const unsigned int nbits = 2;
2597 for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2598 uint8_t wx = *q++;
2599 for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2600 uint8_t c = wx & ((1 << nbits) - 1);
2601 ASSERT(c <= defer);
2602 wx >>= nbits;
2603 abadd(x, dst[0], dst[0], mptrs[c][0]);
2604 mptrs[c] += c != 0;
2605 dst += c == 0;
2606 }
2607 }
2608 #endif
2609 #ifdef COMPRESS_COMBINERS_4
2610 } else if (defer <= 15) {
2611 const unsigned int nbits = 4;
2612 for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2613 uint8_t wx = *q++;
2614 for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2615 uint8_t c = wx & ((1 << nbits) - 1);
2616 ASSERT(c <= defer);
2617 wx >>= nbits;
2618 abadd(x, dst[0], dst[0], mptrs[c][0]);
2619 mptrs[c] += c != 0;
2620 dst += c == 0;
2621 }
2622 }
2623 #endif
2624 } else {
2625 for( ; count-- ; ) {
2626 uint8_t c = *q++;
2627 ASSERT(c <= defer);
2628 abadd(x, dst[0], dst[0], mptrs[c][0]);
2629 mptrs[c] += c != 0;
2630 dst += c == 0;
2631 }
2632 }
2633 }
2634 #endif
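
/* Illustrative decoding of the (uncompressed) combine stream above:
 * each control byte c picks which deferred buffer contributes to the
 * current destination row ; c == 0 reads the dummy zero element and
 * moves on to the next row. E.g. with defer = 2:
 *
 *      const uint8_t q[] = { 1, 2, 0,  0,  2, 0 };
 *      // row 0: add from buffer 1, then buffer 2, then next row
 *      // row 1: nothing, next row
 *      // row 2: add from buffer 2, next row
 *
 * so the stream holds ncoeffs + nrows bytes, matching the
 * ASSERT_ALWAYS(S.c.size() == chdr->ncoeffs + V->steps[l].nrows) in
 * builder_push_vsc_slices().
 */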
2635
2636 #ifndef MATMUL_SUB_VSC_COMBINE_TR_H_
2637 static inline void matmul_sub_vsc_combine_tr(abdst_field x, abelt ** mptrs, const abelt * qw, const uint8_t * z, unsigned int count, unsigned int defer MAYBE_UNUSED)
2638 {
2639 // printf("uncombine(%u), defer %u\n", count, defer);
2640 // printf("uncombine(%u), sum=%x\n", count, idiotic_sum((void*)z, compressed_size(count, defer)));
2641 if (0) {
2642 #ifdef COMPRESS_COMBINERS_1
2643 } else if (defer == 1) {
2644 const unsigned int nbits = 1;
2645 for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2646 uint8_t wx = *z++;
2647 for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2648 uint8_t c = wx & ((1 << nbits) - 1);
2649 ASSERT(c <= defer);
2650 wx >>= nbits;
2651 abset(x, mptrs[c][0], qw[0]);
2652 mptrs[c] += c != 0;
2653 qw += c == 0;
2654 }
2655 }
2656 #endif
2657 #ifdef COMPRESS_COMBINERS_2
2658 } else if (defer <= 3) {
2659 const unsigned int nbits = 2;
2660 for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2661 uint8_t wx = *z++;
2662 for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2663 uint8_t c = wx & ((1 << nbits) - 1);
2664 ASSERT(c <= defer);
2665 wx >>= nbits;
2666 abset(x, mptrs[c][0], qw[0]);
2667 mptrs[c] += c != 0;
2668 qw += c == 0;
2669 }
2670 }
2671 #endif
2672 #ifdef COMPRESS_COMBINERS_4
2673 } else if (defer <= 15) {
2674 const unsigned int nbits = 4;
2675 for(unsigned int i = 0 ; i < count ; i += 8 / nbits) {
2676 uint8_t wx = *z++;
2677 for(unsigned int j = 0 ; nbits * j < 8 && i + j < count ; j++) {
2678 uint8_t c = wx & ((1 << nbits) - 1);
2679 ASSERT(c <= defer);
2680 wx >>= nbits;
2681 abset(x, mptrs[c][0], qw[0]);
2682 mptrs[c] += c != 0;
2683 qw += c == 0;
2684 }
2685 }
2686 #endif
2687 } else {
2688 for( ; count-- ; ) {
2689 uint8_t c = *z++;
2690 ASSERT(c <= defer);
2691 abset(x, mptrs[c][0], qw[0]);
2692 mptrs[c] += c != 0;
2693 qw += c == 0;
2694 }
2695 }
2696 }
2697 #endif
2698
2699 #ifndef MATMUL_SUB_VSC_DISPATCH_TR_H_
2700 static inline void matmul_sub_vsc_dispatch_tr(abdst_field x, abelt * qr, const abelt * q, const uint16_t * z, unsigned int count)
2701 {
2702 // printf("undispatch(%u), sum=%x\n", count, idiotic_sum((void*)z, count * sizeof(uint16_t)));
2703 for( ; count-- ; ) {
2704 abadd(x, qr[*z], qr[*z], q[0]);
2705 z++;
2706 q += 1;
2707 }
2708 }
2709 #endif
2710
2711 /* In preparation for the different steps, including the matrix
2712  * multiplication itself, we rebuild the vsc_slice_t object, or at
2713  * least its skeleton ; this makes it easy to run the control loops.
2714  * The real data payload is of course not copied again ! */
2715
2716 static inline void rebuild_vsc_slice_skeleton(
2717 struct matmul_bucket_data_s * mm,
2718 vector<slice_header_t>::iterator & hdr,
2719 vsc_slice_t * V, struct pos_desc * pos,
2720 unsigned int & nvstrips,
2721 unsigned int & Midx,
2722 unsigned int & Didx,
2723 unsigned int & Ridx,
2724 unsigned int & Cidx)
2725 {
2726 *V->hdr = *hdr++;
2727 nvstrips = *pos->ql++;
2728 V->dispatch.resize(nvstrips);
2729 V->steps.resize(*pos->ql++);
2730 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2731 V->steps[l].defer = *pos->ql++;
2732 V->steps[l].tbuf_space = *pos->ql++;
2733 }
2734
2735 Midx = hdr - mm->headers.begin();
2736 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
2737 vsc_middle_slice_t &D (V->dispatch[k]);
2738 *D.hdr = *hdr++;
2739 }
2740 Didx = hdr - mm->headers.begin();
2741 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
2742 vsc_middle_slice_t &D (V->dispatch[k]);
2743 D.sub.resize(V->steps.size());
2744 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2745 vsc_sub_slice_t & S(D.sub[l]);
2746 *S.hdr = *hdr++;
2747 V->steps[l].nrows = S.hdr->i1 - S.hdr->i0;
2748 }
2749 }
2750 /* Skip over the combining headers as well */
2751 Ridx = hdr - mm->headers.begin();
2752 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2753 hdr++;
2754 }
2755 Cidx = hdr - mm->headers.begin();
2756 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
2757 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2758 if (!flush_here(k, V->dispatch.size(), V->steps[l].defer))
2759 continue;
2760 hdr++;
2761 }
2762 }
2763 }
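
/* Header layout assumed by the skeleton rebuild above, in the order
 * builder_push_vsc_slices() emitted them (sketch):
 *
 *      [DEFER_ENVELOPE]                          <- *V->hdr
 *      [DEFER_COLUMN]  x nvstrips                <- Midx points here
 *      [DEFER_DIS]     x nvstrips * nsteps       <- Didx
 *      [DEFER_ROW]     x nsteps                  <- Ridx
 *      [DEFER_CMB]     x (one per flush point)   <- Cidx
 *
 * The M/D/R/C indices returned here let the multiply and report code
 * charge timings to the right header without re-deriving the layout.
 */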
2764 static inline void matmul_bucket_mul_vsc(struct matmul_bucket_data_s * mm, vector<slice_header_t>::iterator & hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
2765 {
2766 abdst_field x = mm->xab;
2767
2768 int usual = d == ! mm->public_->store_transposed;
2769
2770 abelt * where = dst + (usual ? hdr->i0 : hdr->j0);
2771 // abelt const * from = src + (usual ? hdr->j0 : hdr->i0);
2772 if ((usual ? hdr->j0 : hdr->i0) == 0) { /* first to come, first to clear */
2773 abvec_set_zero(x, where, usual ? (hdr->i1 - hdr->i0) : (hdr->j1 - hdr->j0));
2774 }
2775
2776 abelt * scratch = mm->scratch3;
2777
2778 /* {{{ */
2779
2780 vsc_slice_t V[1];
2781 unsigned int nvstrips;
2782 unsigned int Midx;
2783 unsigned int Didx;
2784 unsigned int Ridx;
2785 unsigned int Cidx;
2786 rebuild_vsc_slice_skeleton(mm, hdr, V, pos, nvstrips, Midx, Didx, Ridx, Cidx);
2787
2788 /*}}}*/
2789
2790 ASM_COMMENT("multiplication code -- vertical staircase");/*{{{*/
2791
2792 /* now prepare pointers */
2793 vector<abelt *> base_ptrs;
2794 vector<abelt *> cptrs;
2795 abelt * q0 = scratch;
2796 abelt * dummy = q0;
2797 abvec_set_zero(x, dummy, 1);
2798 q0++;
2799 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2800 base_ptrs.push_back(q0);
2801 cptrs.push_back(q0);
2802 q0 += V->steps[l].tbuf_space;
2803 }
2804 base_ptrs.push_back(q0);
2805
2806 if (usual) {
2807 uint32_t skipover = *pos->ql++;
2808 pos->ql += skipover;
2809 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
2810 vsc_middle_slice_t const & D(V->dispatch[k]);
2811 const abelt * qr = src + D.hdr->j0;
2812 mm->slice_timings[Midx].t -= wct_seconds();
2813 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2814 vsc_sub_slice_t const & S(D.sub[l]);
2815 abelt * q = cptrs[l];
2816 unsigned int count = S.hdr->ncoeffs;
2817 ASSERT(base_ptrs[l] <= q);
2818 ASSERT(q <= base_ptrs[l+1]);
2819 mm->slice_timings[Didx].t -= wct_seconds();
2820 matmul_sub_vsc_dispatch(x, q, qr, pos->q16, count);
2821 q += count;
2822 pos->q16 += count;
2823 mm->slice_timings[Didx].t += wct_seconds();
2824 Didx++;
2825 ASSERT(q <= base_ptrs[l+1]);
2826 cptrs[l] = q;
2827 }
2828 mm->slice_timings[Midx].t += wct_seconds();
2829 Midx++;
2830
2831 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2832 vsc_sub_slice_t const & S(D.sub[l]);
2833 unsigned int defer = V->steps[l].defer;
2834 if (!flush_here(k,nvstrips,defer))
2835 continue;
2836
2837 /* our different read pointers */
2838 vector<abelt const *> mptrs;
2839 mptrs.reserve(defer + 1);
2840 abelt const * q = base_ptrs[l];
2841 mptrs.push_back(dummy);
2842 for(unsigned int k0 = k - k % defer ; k0 <= k ; k0++) {
2843 mptrs.push_back(q);
2844 q += V->dispatch[k0].sub[l].hdr->ncoeffs;
2845 }
2846
2847 abelt * qw = dst + S.hdr->i0;
2848 unsigned int count = mm->headers[Cidx].ncoeffs + V->steps[l].nrows;
2849 ASSERT(V->steps[l].nrows == S.hdr->i1 - S.hdr->i0);
2850 ASSERT(q - base_ptrs[l] == (ptrdiff_t) mm->headers[Cidx].ncoeffs);
2851
2852 double t = wct_seconds();
2853 mm->slice_timings[Cidx].t -= t;
2854 mm->slice_timings[Ridx+l].t -= t;
2855 matmul_sub_vsc_combine(x, qw, ptrbegin(mptrs), pos->q8, count, defer);
2856 pos->q8 += compressed_size(count, defer);
2857 t = wct_seconds();
2858 mm->slice_timings[Cidx].t += t;
2859 mm->slice_timings[Ridx+l].t += t;
2860 Cidx++;
2861 cptrs[l]=base_ptrs[l];
2862 }
2863 }
2864 ASSERT((ptrdiff_t) Cidx == hdr - mm->headers.begin());
2865 } else {
2866 /* There's quite an annoying difficulty here. Combining buffers
2867  * are not read in the same order here, because ``combining''
2868  * (whose meaning, in the transposed case, turns into actually
2869  * dispatching) of course occurs _earlier_ in the operation.
2870  */
2871 pos->ql++; // only the count, for fast skipover.
2872 for(unsigned int k = 0 ; k < V->dispatch.size() ; k++) {
2873 vsc_middle_slice_t const & D(V->dispatch[k]);
2874 abelt * qr = dst + D.hdr->j0;
2875 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2876 vsc_sub_slice_t const & S(D.sub[l]);
2877 unsigned int defer = V->steps[l].defer;
2878
2879 /* Here the test is not the same. We have to fill the
2880 * scratch buffers ahead of time. */
2881 if (k % defer)
2882 continue;
2883
2884 /* our different _write_ pointers */
2885 vector<abelt *> mptrs;
2886 mptrs.reserve(defer + 1);
2887 abelt * q = base_ptrs[l];
2888 cptrs[l] = q;
2889 mptrs.push_back(dummy);
2890 for(unsigned int k0 = k ; k0 <= when_flush(k,nvstrips,defer) ; k0++) {
2891 mptrs.push_back(q);
2892 q += V->dispatch[k0].sub[l].hdr->ncoeffs;
2893 }
2894
2895 const abelt * qw = src + S.hdr->i0;
2896 // unsigned int count = mm->headers[Cidx].ncoeffs + V->steps[l].nrows;
2897 const uint8_t * z = pos->q8 + *pos->ql++;
2898 unsigned int count = *pos->ql++;
2899
2900 double t = wct_seconds();
2901 // mm->slice_timings[Cidx].t -= t;
2902 mm->slice_timings[Ridx+l].t -= t;
2903 matmul_sub_vsc_combine_tr(x, ptrbegin(mptrs), qw, z, count, defer);
2904 z += compressed_size(count, defer);
2905 t = wct_seconds();
2906 // mm->slice_timings[Cidx].t += t;
2907 mm->slice_timings[Ridx+l].t += t;
2908 // Cidx++;
2909 }
2910
2911 mm->slice_timings[Midx].t -= wct_seconds();
2912 for(unsigned int l = 0 ; l < V->steps.size() ; l++) {
2913 vsc_sub_slice_t const & S(D.sub[l]);
2914 abelt * q = cptrs[l];
2915 unsigned int count = S.hdr->ncoeffs;
2916 ASSERT(base_ptrs[l] <= q);
2917 ASSERT(q <= base_ptrs[l+1]);
2918 mm->slice_timings[Didx].t -= wct_seconds();
2919 matmul_sub_vsc_dispatch_tr(x, qr, q, pos->q16, count);
2920 cptrs[l] += count;
2921 pos->q16 += count;
2922 mm->slice_timings[Didx].t += wct_seconds();
2923 Didx++;
2924 ASSERT(q <= base_ptrs[l+1]);
2925 }
2926 mm->slice_timings[Midx].t += wct_seconds();
2927 Midx++;
2928 }
2929 // ASSERT(Cidx == hdr - mm->headers.begin());
2930 pos->q8 += *pos->ql++;
2931 }
2932
2933 ASM_COMMENT("end of vertical staircase");/*}}}*/
2934
2935 /* It's an envelope type, and the caller's loop increments the
2936  * iterator again, so step back onto the last header consumed. */
2937 hdr--;
2938 }
2939
2940 ///////////////////////////////////////////////////////////////////////
2941 // walk the headers once to position the stream pointers; the disabled
2942 // block below counts fbi/fbd/asb coefficient schedulings per iteration.
2943
2944 static inline void mm_finish_init(struct matmul_bucket_data_s * mm)
2945 {
2946 struct pos_desc pos[1];
2947
2948 pos->q16 = ptrbegin(mm->t16);
2949 pos->q8 = ptrbegin(mm->t8);
2950 pos->ql = ptrbegin(mm->aux);
2951 pos->i = 0;
2952 pos->nrows_t = mm->public_->dim[ mm->public_->store_transposed];
2953 pos->ncols_t = mm->public_->dim[!mm->public_->store_transposed];
2954
2955 /*
2956 for(uint16_t h = 0 ; h < mm->headers.size() ; h++) {
2957 slice_header_t * hdr = & (mm->headers[h]);
2958 printf("block %d %s [%d..%d[ x [%d..%d[, %d cld, %" PRIu64 " coeffs\n",
2959 h, slice_name(hdr->t),
2960 hdr->i0, hdr->i1,
2961 hdr->j0, hdr->j1,
2962 hdr->nchildren, hdr->ncoeffs);
2963 }
2964 */
2965 for(vector<slice_header_t>::iterator hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
2966 switch(hdr->t) {
2967 case SLICE_TYPE_SMALL1:
2968 break;
2969 case SLICE_TYPE_SMALL1_VBLOCK:
2970 // mm->mms1_ncoeffs += hdr->ncoeffs;
2971 pos->q16 += 2 * hdr->ncoeffs;
2972 break;
2973 case SLICE_TYPE_SMALL2:
2974 pos->q16 += hdr->ncoeffs;
2975 pos->q16 += (2 - 1) & - hdr->ncoeffs;
2976 // mm->mms2_ncoeffs += hdr->ncoeffs;
2977 break;
2978 case SLICE_TYPE_LARGE_ENVELOPE:
2979 {
2980 uint32_t j = 0;
2981 for( ; j < pos->ncols_t ; ) {
2982 uint32_t j1 = j + *pos->ql++;
2983 uint32_t n = *pos->ql++;
2984 // mm->fbi_ncoeffs += n;
2985 // mm->asb_ncoeffs += n;
2986 pos->q8 += 3*n;
2987 pos->q8 += n & 1;
2988 pos->ql += LSL_NBUCKETS_MAX;
2989 j = j1;
2990 }
2991 }
2992 break;
2993 case SLICE_TYPE_HUGE_ENVELOPE:
2994 {
2995 uint32_t j = 0;
2996 unsigned int nlarge = *pos->ql++;
2997 for( ; j < pos->ncols_t ; ) {
2998 uint32_t j1 = j + *pos->ql++;
2999 unsigned int n = *pos->ql++;
3000 // mm->fbi_ncoeffs += n;
3001 // mm->fbd_ncoeffs += n;
3002 // mm->asb_ncoeffs += n;
3003 pos->ql += nlarge;
3004 pos->ql += nlarge * LSL_NBUCKETS_MAX;
3005 pos->q8 += 4*n;
3006 j = j1;
3007 }
3008 }
3009 break;
3010 case SLICE_TYPE_DEFER_ENVELOPE:
3011 {
3012 vsc_slice_t V[1];
3013 unsigned int nvstrips;
3014 unsigned int Midx;
3015 unsigned int Didx;
3016 unsigned int Ridx;
3017 unsigned int Cidx;
3018 rebuild_vsc_slice_skeleton(mm, hdr, V, pos, nvstrips, Midx, Didx, Ridx, Cidx);
3019 hdr--;
3020 /*
3021 for(unsigned int s = 0 ; s < V->steps.size() ; s++) {
3022 unsigned int defer = V->steps[s].defer;
3023 printf("Rows %" PRIu32 "+%" PRIu32, mm->headers[Ridx++].i0, V->steps[s].nrows);
3024 if (V->steps[s].density_upper_bound != UINT_MAX) {
3025 printf(": d < %u", V->steps[s].density_upper_bound);
3026 }
3027 printf("; %u flushes (every %u), tbuf space %lu\n", iceildiv(nvstrips, defer), defer, V->steps[s].tbuf_space);
3028 }
3029 */
3030 }
3031 break;
3032 default:
3033 fprintf(stderr, "Unexpected block %s encountered\n",
3034 slice_name(hdr->t));
3035 break;
3036 }
3037 if (hdr->j1 == pos->ncols_t) {
3038 pos->i = hdr->i1;
3039 }
3040 }
3041
3042 #if 0
3043 for(uint16_t h = 0 ; h < mm->headers.size() ; h++) {
3044 slice_header_t * hdr = & (mm->headers[h]);
3045 ASSERT_ALWAYS(pos->i == hdr->i0);
3046 switch(hdr->t) {
3047 case SLICE_TYPE_SMALL1_VBLOCK:
3048 mm->mms1_ncoeffs += hdr->ncoeffs;
3049 pos->q16 += 2 * hdr->ncoeffs;
3050 break;
3051 case SLICE_TYPE_SMALL2:
3052 pos->q16 += hdr->ncoeffs;
3053 pos->q16 += (2 - 1) & - hdr->ncoeffs;
3054 mm->mms2_ncoeffs += hdr->ncoeffs;
3055 break;
3056 case SLICE_TYPE_LARGE_ENVELOPE:
3057 {
3058 uint32_t j = 0;
3059 for( ; j < pos->ncols_t ; ) {
3060 uint32_t j1 = j + *pos->ql++;
3061 uint32_t n = *pos->ql++;
3062 mm->fbi_ncoeffs += n;
3063 mm->asb_ncoeffs += n;
3064 // pos->q8 += 3*n;
3065 pos->ql += LSL_NBUCKETS_MAX;
3066 j = j1;
3067 }
3068 }
3069 break;
3070 case SLICE_TYPE_HUGE_ENVELOPE:
3071 {
3072 uint32_t j = 0;
3073 unsigned int nlarge = *pos->ql++;
3074 for( ; j < pos->ncols_t ; ) {
3075 uint32_t j1 = j + *pos->ql++;
3076 unsigned int n = *pos->ql++;
3077 mm->fbi_ncoeffs += n;
3078 mm->fbd_ncoeffs += n;
3079 mm->asb_ncoeffs += n;
3080 pos->ql += nlarge;
3081 pos->ql += nlarge * LSL_NBUCKETS_MAX;
3082 // pos->q8 += 4*n;
3083 j = j1;
3084 }
3085 }
3086 case SLICE_TYPE_DEFER_ENVELOPE:
3087 default:
3088 break;
3089 }
3090 if (hdr->j1 == pos->ncols_t) {
3091 pos->i = hdr->i1;
3092 }
3093 }
3094 #endif
3095
3096 abvec_init(mm->xab, &mm->scratch1, mm->scratch1size);
3097 abvec_init(mm->xab, &mm->scratch2, mm->scratch2size);
3098 abvec_init(mm->xab, &mm->scratch3, mm->scratch3size);
3099
3100 mm->slice_timings.resize(mm->headers.size());
3101 }
3102
3103 static void matmul_bucket_zero_stats(struct matmul_bucket_data_s * mm)
3104 {
3105 vector<slice_header_t>::iterator hdr;
3106 for(hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
3107 unsigned int hidx = hdr - mm->headers.begin();
3108 mm->slice_timings[hidx].t = 0;
3109 }
3110 }
3111
3112 static inline void matmul_bucket_mul_small1(struct matmul_bucket_data_s * mm, vector<slice_header_t>::iterator & hdr, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
3113 {
3114 slice_header_t & h(*hdr++);
3115 for(unsigned int i = 0 ; i < h.nchildren ; i++, hdr++) {
3116 ASSERT(hdr != mm->headers.end());
3117 ASSERT(hdr->t == SLICE_TYPE_SMALL1_VBLOCK);
3118 mm->slice_timings[hdr - mm->headers.begin()].t -= wct_seconds();
3119 matmul_bucket_mul_small1_vblock(mm, &*hdr, dst, src, d, pos);
3120 mm->slice_timings[hdr - mm->headers.begin()].t += wct_seconds();
3121 }
3122 hdr--;
3123 }
3124
3125 static inline void matmul_bucket_mul_loop(struct matmul_bucket_data_s * mm, abelt * dst, abelt const * src, int d, struct pos_desc * pos)
3126 {
3127 vector<slice_header_t>::iterator hdr;
3128
3129 for(hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
3130 unsigned int hidx = hdr - mm->headers.begin();
3131 mm->slice_timings[hidx].t -= wct_seconds();
3132 switch(hdr->t) {
3133 case SLICE_TYPE_SMALL2:
3134 matmul_bucket_mul_small2(mm, &*hdr, dst, src, d, pos);
3135 break;
3136 case SLICE_TYPE_SMALL1:
3137 matmul_bucket_mul_small1(mm, hdr, dst, src, d, pos);
3138 break;
3139 case SLICE_TYPE_LARGE_ENVELOPE:
3140 matmul_bucket_mul_large(mm, &*hdr, dst, src, d, pos);
3141 break;
3142 case SLICE_TYPE_HUGE_ENVELOPE:
3143 matmul_bucket_mul_huge(mm, &*hdr, dst, src, d, pos);
3144 break;
3145 case SLICE_TYPE_DEFER_ENVELOPE:
3146 matmul_bucket_mul_vsc(mm, hdr, dst, src, d, pos);
3147 break;
3148 /* some slice types are ``contained'', and we should never
3149  * see them here. NOTE that this implies in particular that the
3150  * timing for the "defer" type (DEFER_ENVELOPE) is counted here,
3151  * thus including all overhead, while the timing for the
3152  * inner objects is counted from within the subroutines. */
3153 case SLICE_TYPE_SMALL1_VBLOCK:
3154 case SLICE_TYPE_DEFER_COLUMN:
3155 case SLICE_TYPE_DEFER_ROW:
3156 case SLICE_TYPE_DEFER_CMB:
3157 case SLICE_TYPE_DEFER_DIS:
3158 break;
3159 /* The current implementation no longer accepts data not
3160 * obeying the header structure, so the default branch also
3161 * aborts. */
3162 default:
3163 fprintf(stderr, "Bogus slice type seen: %d\n", hdr->t);
3164 ASSERT_ALWAYS(0);
3165 }
3166 if (hdr->j1 == pos->ncols_t) { pos->i = hdr->i1; }
3167 mm->slice_timings[hidx].t += wct_seconds();
3168 }
3169 }
3170
3171 void MATMUL_NAME(mul)(matmul_ptr mm0, void * xdst, void const * xsrc, int d)
3172 {
3173 struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
3174 struct pos_desc pos[1];
3175
3176 abdst_vec dst = (abdst_vec) xdst;
3177 absrc_vec src = (absrc_vec) const_cast<void*>(xsrc);
3178 pos->q16 = ptrbegin(mm->t16);
3179 pos->q8 = ptrbegin(mm->t8);
3180 pos->ql = ptrbegin(mm->aux);
3181 pos->i = 0;
3182 pos->nrows_t = mm->public_->dim[ mm->public_->store_transposed];
3183 pos->ncols_t = mm->public_->dim[!mm->public_->store_transposed];
3184
3185 abdst_field x = mm->xab;
3186
3187 mm->main_timing.t -= wct_seconds();
3188
3189 if (d == !mm->public_->store_transposed) {
3190 /* This is the ``normal'' case (matrix times vector). */
3191 } else {
3192 /* d == mm->public_->store_transposed */
3193 /* BEWARE, it's a priori sub-optimal ! In practice, the
3194 * difference isn't so striking though. */
3195 if (mm->public_->iteration[d] == 10) {
3196 fprintf(stderr, "Warning: Doing many iterations with bad code\n");
3197 }
3198 /* We zero out the dst area beforehand */
3199 abvec_set_zero(x, dst, pos->ncols_t);
3200 }
3201 matmul_bucket_mul_loop(mm, dst, src, d, pos);
3202
3203 mm->main_timing.t += wct_seconds();
3204
3205 mm->public_->iteration[d]++;
3206 }
3207
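/* Header layout expected (and asserted) below for a deferred (vsc)
 * block: a SLICE_TYPE_DEFER_ENVELOPE whose nchildren is the number of
 * vertical strips, then nvstrips SLICE_TYPE_DEFER_COLUMN headers (the
 * first one carrying the number of steps in its nchildren field), then
 * nvstrips*nsteps SLICE_TYPE_DEFER_DIS headers, then nsteps
 * SLICE_TYPE_DEFER_ROW headers, and finally a run of
 * SLICE_TYPE_DEFER_CMB headers. */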
static std::ostream& matmul_bucket_report_vsc(std::ostream& os, struct matmul_bucket_data_s * mm, double scale, vector<slice_header_t>::iterator & hdr, double * p_t_total)
{
    uint64_t scale0;
    scale0 = (mm->public_->iteration[0] + mm->public_->iteration[1]);
    unsigned int nvstrips = hdr->nchildren;
    hdr++;
    unsigned int nsteps = hdr->nchildren;
    vector<pair<uint64_t, double> > dtime(nsteps);
    vector<pair<uint64_t, double> > ctime(nsteps);
    for(unsigned int k = 0 ; k < nvstrips ; k++) {
        ASSERT_ALWAYS(hdr->t == SLICE_TYPE_DEFER_COLUMN);
        hdr++;
    }
    // hdr+=nvstrips;
    for(unsigned int k = 0 ; k < nvstrips ; k++) {
        for(unsigned int l = 0 ; l < nsteps ; l++) {
            ASSERT_ALWAYS(hdr->t == SLICE_TYPE_DEFER_DIS);
            double t = mm->slice_timings[hdr - mm->headers.begin()].t;
            uint64_t nc = hdr->ncoeffs;
            dtime[l].first += nc;
            dtime[l].second += t;
            hdr++;
        }
    }
    double total_from_defer_rows = 0;
    double total_from_defer_cmbs = 0;
    for(unsigned int l = 0 ; l < nsteps ; l++) {
        ASSERT_ALWAYS(hdr->t == SLICE_TYPE_DEFER_ROW);
        double t = mm->slice_timings[hdr - mm->headers.begin()].t;
        uint64_t nc = hdr->ncoeffs;
        ctime[l].first += nc;
        ctime[l].second += t;
        total_from_defer_rows += t;
        hdr++;
    }
    /* Skip the combining blocks, because they're accounted for already
     * by the row blocks */
    for( ; hdr != mm->headers.end() && hdr->t == SLICE_TYPE_DEFER_CMB ; hdr++) {
        double t = mm->slice_timings[hdr - mm->headers.begin()].t;
        total_from_defer_cmbs += t;
    }
    /* Some jitter will appear if transposed mults are performed, because
     * for the moment transposed mults don't properly store timing info.
     */
    /*
    printf("jitter %.2f - %.2f = %.2f\n",
            total_from_defer_rows / scale0, total_from_defer_cmbs / scale0,
            (total_from_defer_rows - total_from_defer_cmbs) / scale0);
    */
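    /* Print one pair of lines per step: first the dispatching cost
     * (DEFER_DIS, aggregated over all vertical strips), then the
     * row/combining cost (DEFER_ROW), both normalized by the total
     * iteration count. */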
    for(unsigned int l = 0 ; l < nsteps ; l++) {
        double t;
        uint64_t nc;
        double a;
        nc = dtime[l].first;
        t = dtime[l].second / scale0;
        a = 1.0e9 * t / nc;
        *p_t_total += t;
        os << fmt::sprintf("defer\t%.2fs ; n=%-9" PRIu64 " ; %5.2f ns/c ;"
                " scaled*%.2f : %5.2f/c\n",
                t, nc, a, scale, a * scale);
        nc = ctime[l].first;
        t = ctime[l].second / scale0;
        a = 1.0e9 * t / nc;
        *p_t_total += t;
        os << fmt::sprintf(" + %.2fs [%.2fs] ; n=%-9" PRIu64 " ; %5.2f ns/c ;"
                " scaled*%.2f : %5.2f/c\n",
                t, *p_t_total, nc, a, scale, a * scale);
    }
    hdr--;
    return os;
}


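/* Builds the whole report into an ostringstream, then hands it over to
 * the caller through mm->public_->report_string. All times are divided
 * by scale0, the total number of iterations in both directions. */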
void MATMUL_NAME(report)(matmul_ptr mm0, double scale)
{
    struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
    uint64_t scale0;
    scale0 = (mm->public_->iteration[0] + mm->public_->iteration[1]);

    std::ostringstream os;

    vector<slice_header_t>::iterator hdr;

    os << fmt::sprintf("n %" PRIu64 " %.3fs/iter (wct of cpu-bound loop)\n",
            mm->public_->ncoeffs,
            mm->main_timing.t / scale0
            );

    double t_total = 0;
    for(hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
        if (hdr->t == SLICE_TYPE_SMALL1_VBLOCK) continue;
        if (hdr->t == SLICE_TYPE_DEFER_ENVELOPE) {
            matmul_bucket_report_vsc(os, mm, scale, hdr, &t_total);
            continue;
        }
        double t = mm->slice_timings[hdr - mm->headers.begin()].t;
        uint64_t nc = hdr->ncoeffs;
        t /= scale0;
        t_total += t;
        double a = 1.0e9 * t / nc;
        os << fmt::sprintf("%s\t%.2fs [%.2fs] ; n=%-9" PRIu64 " ; %5.2f ns/c ;"
                " scaled*%.2f : %5.2f/c\n",
                slice_name(hdr->t), t, t_total,
                nc, a, scale, a * scale);
    }
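    /* Below the separator: aggregate time and coefficient counts per
     * slice type, over all slices of that type. */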
    for(int i = 0 ; i < 40 ; i++) os << '-';
    os << '\n';
    for(int i = 0 ; i < SLICE_TYPE_MAX ; i++) {
        if (i == SLICE_TYPE_SMALL1_VBLOCK) continue;
        double t = 0;
        uint64_t nc = 0;
        for(hdr = mm->headers.begin() ; hdr != mm->headers.end() ; hdr++) {
            if (hdr->t != i) continue;
            nc += hdr->ncoeffs;
            t += mm->slice_timings[hdr - mm->headers.begin()].t;
        }
        if (nc == 0) continue;
        t /= scale0;
        double a = 1.0e9 * t / nc;
        os << fmt::sprintf("%s\t%.2fs ; n=%-9" PRIu64 " ; %5.2f ns/c ;"
                " scaled*%.2f : %5.2f/c\n",
                slice_name(i), t,
                nc, a, scale, a * scale);
    }
    if (mm->public_->report_string)
        free(mm->public_->report_string);
    mm->public_->report_string_size = os.str().size() + 1;
    mm->public_->report_string = strdup(os.str().c_str());
}

void MATMUL_NAME(auxv)(matmul_ptr mm0, int op MAYBE_UNUSED, va_list ap MAYBE_UNUSED)
{
    struct matmul_bucket_data_s * mm = (struct matmul_bucket_data_s *)mm0;
    if (op == MATMUL_AUX_ZERO_STATS) {
        matmul_bucket_zero_stats(mm);
        mm->public_->iteration[0] = 0;
        mm->public_->iteration[1] = 0;
    }
#if 0
    if (op == MATMUL_AUX_GET_READAHEAD) {
        unsigned int * res = va_arg(ap, unsigned int *);
        ++*res;
    }
#endif
}


void MATMUL_NAME(aux)(matmul_ptr mm0, int op, ...)
{
    va_list ap;
    va_start(ap, op);
    MATMUL_NAME(auxv) (mm0, op, ap);
    va_end(ap);
}
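
/* Example (a sketch, following the file's own #if 0 convention): a
 * caller holding a matmul_ptr mm0 could reset the per-slice timers and
 * iteration counters before a fresh timing run with: */
#if 0
MATMUL_NAME(aux)(mm0, MATMUL_AUX_ZERO_STATS);
#endif
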
/* vim: set sw=4: */