1 /* Manage the in-memory data for the matrix */
2 /* It's in C++ because the STL is handy, but that's really all there is
3  * to it... */
4 #include "cado.h" // IWYU pragma: keep
5 #include <cstdio>
6 #include <cmath>
7 #include <cstdint>
8 #include <cstring>
9 #include <cstdarg>         // for va_list, va_end, va_start
10 
11 // C++ headers.
12 #include <utility>          // for pair, make_pair
13 #include <vector>
14 #include <algorithm>    // sort
15 #include <iostream>     // cout
16 
17 using namespace std;
18 
19 #include "matmul.h"         // for matmul_ptr, matmul_public_s
20 #include "mpfq_layer.h"
21 #include "matmul-common.h"
22 #include "matmul_facade.h"
23 #include "macros.h"
24 #include "params.h"
25 
26 // assembly is now disabled for this code because the semantics of the
27 // asm code have changed.
28 
29 // #define L1_CACHE_SIZE   32768
30 // take only 3/4 of the L1 cache.
31 #define L1_CACHE_SIZE   24576
32 // for 8-bytes abelt values, this gives 3072 items.
33 
34 /* Here is how the matrix is stored in memory, in the
35  * "matmul_sliced_data_s" type.  The matrix is cut in slices, where each
36  * slice is a set of contiguous rows. The size of a slice is tuned so as
37  * to fill the L1 cache as per the macro above, with some adjustment to
38  * handle non-exact divisibility: some slices will have packbase rows and
39  * some others packbase+1 rows.  Within a slice, entries are stored as a
40  * list of pairs
41  *   (column index, row index),
42  * sorted according to column index. Then, only the difference between
43  * two consecutive column indices is actually stored, so that it will fit
44  * in 16 bits. Also the row index fits in 16 bits, so that a slice is
45  * actually a list of 16-bit unsigned integers.
46  * All the slices are stored in a big array of uint16_t, in the data
47  * field. Within this data field, the information is organized like so:
48  *   data[0] : total number of slices
49  *   data[1] : number of rows in slice number 1
50  *   data[2..3]: number of entries in slice number 1, say k_1
51  *   data[4..(4+2*k_1)]: list of entries of slice number 1
52  *   data[(4+2k_1+1)]: number of rows in slice number 2
53  *   etc...
54  * Additionally, for each slice, a slice_info structure is stored, giving
55  * statistics about the slice, and the offset in the data array where it
56  * is stored.
57  */
58 
59 /* This extension is used to distinguish between several possible
60  * implementations of the product. The upper word correspond to the
61  * implementation itself, the lower one to the n-th binary incompatible
62  * change (make sure to bump it) */
63 #define MM_EXTENSION   "-sliced"
64 
65 #define MM_MAGIC_FAMILY        0xa000UL
66 #define MM_MAGIC_VERSION       0x1007UL
67 #define MM_MAGIC (MM_MAGIC_FAMILY << 16 | MM_MAGIC_VERSION)
68 
69 /* see matmul-basic.c */
70 #define MM_DIR0_PREFERS_TRANSP_MULT   1
71 
72 struct slice_info {
73     unsigned int nrows;
74     unsigned int ncoeffs;
75     double spent_time;
76     unsigned int npasses;
77     double dj_avg;
78     double dj_sdev;
79     double di_avg;
80     double di_sdev;
81     int32_t di_max;
82     size_t data_offset;
83 };
84 
85 typedef vector<uint16_t> data_t;
86 
87 struct matmul_sliced_data_s {
88     /* repeat the fields from the public_ interface */
89     struct matmul_public_s public_[1];
90     /* now our private fields */
91     abdst_field xab;
92     data_t data;
93     vector<slice_info> dslices_info;
94     unsigned int npack;
pushmatmul_sliced_data_s95     void push(uint32_t x)
96     {
97         ASSERT_ALWAYS(x >> 16 == 0);
98         data.push_back(x);
99     }
push32matmul_sliced_data_s100     void push32(uint64_t x)
101     {
102         ASSERT_ALWAYS(x >> 32 == 0);
103         data.push_back(x & ((1u << 16) - 1));
104         x >>= 16;
105         data.push_back(x & ((1u << 16) - 1));
106     }
read32matmul_sliced_data_s107     static inline uint32_t read32(data_t::const_iterator & q) {
108         uint32_t res;
109         res = *q++;
110         res |= ((uint32_t) *q++) << 16;
111         return res;
112     }
read32matmul_sliced_data_s113     static inline uint32_t read32(const uint16_t * & q) {
114         uint32_t res;
115         res = *q++;
116         res |= ((uint32_t) *q++) << 16;
117         return res;
118     }
119 };
120 
MATMUL_NAME(clear)121 void MATMUL_NAME(clear)(matmul_ptr mm0)
122 {
123     struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
124     matmul_common_clear(mm->public_);
125     mm->dslices_info.clear();
126     mm->data.clear();
127     delete mm;
128 }
129 
MATMUL_NAME(init)130 matmul_ptr MATMUL_NAME(init)(void * pxx, param_list pl, int optimized_direction)
131 {
132     abdst_field xx = (abdst_field) pxx;
133     struct matmul_sliced_data_s * mm;
134     mm = new matmul_sliced_data_s;
135     // see bug 21663
136     memset((void*)mm, 0, sizeof(struct matmul_sliced_data_s));
137     mm->xab = xx;
138 
139     unsigned int npack = L1_CACHE_SIZE;
140     if (pl) param_list_parse_uint(pl, "l1_cache_size", &npack);
141     npack /= sizeof(abelt);
142     mm->npack = npack;
143 
144     int suggest = optimized_direction ^ MM_DIR0_PREFERS_TRANSP_MULT;
145     mm->public_->store_transposed = suggest;
146     if (pl) {
147         param_list_parse_int(pl, "mm_store_transposed",
148                 &mm->public_->store_transposed);
149         if (mm->public_->store_transposed != suggest) {
150             fprintf(stderr, "Warning, mm_store_transposed"
151                     " overrides suggested matrix storage ordering\n");
152         }
153     }
154 
155 
156     return (matmul_ptr) mm;
157 }
158 
159 
MATMUL_NAME(build_cache)160 void MATMUL_NAME(build_cache)(matmul_ptr mm0, uint32_t * data, size_t size MAYBE_UNUSED)
161 {
162     struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
163     ASSERT_ALWAYS(data);
164 
165     uint32_t i0 = 0;
166     uint32_t i1 = mm->public_->dim[ mm->public_->store_transposed];
167 
168     unsigned int nslices = iceildiv(i1-i0, mm->npack);
169     unsigned int nslices_index = mm->data.size();
170     mm->push(0);
171     mm->push(0);        // placeholder for alignment
172 
173     unsigned int packbase = (i1-i0) / nslices;
174 
175     unsigned int nslices1 = (i1-i0) % nslices; /* 1+packbase-sized slices */
176     unsigned int nslices0 = nslices - nslices1;  /* packbase-sized slices */
177     unsigned int current;
178     unsigned int next = i0;
179     unsigned int s;
180 
181     uint32_t * ptr = data;
182 
183     for(s = 0 ; s < nslices ; s++) {
184         current = next;
185         unsigned int npack = packbase + (s < nslices1);
186         next = current + npack;
187 
188         slice_info si; memset(&si,0,sizeof(si)); si.nrows = npack;
189         si.data_offset = mm->data.size();
190 
191         /*
192            std::cout << "Packing " << npack << " rows from " << current
193            << " to " << next << std::endl;
194            */
195         typedef std::vector<std::pair<uint32_t, uint32_t> > L_t;
196         typedef L_t::const_iterator Lci_t;
197         L_t L;
198         for(unsigned int di = 0 ; di < npack ; di++) {
199             for(unsigned int j = 0 ; j < *ptr ; j++) {
200                 L.push_back(std::make_pair(ptr[1+j],di));
201             }
202             mm->public_->ncoeffs += *ptr;
203             ptr += 1 + *ptr;
204         }
205         /* L is the list of (column index, row index) of all
206          * coefficients in the current horizontal slice */
207         std::sort(L.begin(), L.end());
208 
209         mm->push32(npack);
210         mm->push32(L.size());
211 
212         uint32_t j = 0;
213         uint32_t i = 0;
214         int32_t di_max = 0;
215         double sumdj=0;
216         double sumdj2=0;
217         double sumdi=0;
218         double sumdi2=0;
219         uint32_t weight=0;
220         for(Lci_t lp = L.begin() ; lp != L.end() ; ++lp) {
221             uint32_t dj = lp->first - j; j += dj;
222             int32_t di = lp->second - i; i += di;
223             weight++;
224 
225             if (di<0) di = -di;
226             if (di > di_max) di_max = di;
227 
228             double ddj = dj;
229             double ddi = di;
230             sumdj+=ddj;
231             sumdj2+=ddj*ddj;
232             sumdi+=ddi;
233             sumdi2+=ddi*ddi;
234 
235             /* If di exceeds +/- 128, then we can emulate the
236              * wide displacement by something like:
237              * (di_max,0)(0,0)(di_max,0)(0,0)...(di_remainder,dj)
238              *
239              * It's also really likely that one would gain by
240              * moving _both_ up and down in the strip, so that
241              * the negative offsets remain significant
242              * (otherwise, we'd rather do a cmov based on the
243              * sign, or something).
244              */
245 
246             mm->push(dj); mm->push(i);
247         }
248         mm->data[nslices_index]++;
249         double dj2_avg = sumdj2 / weight;
250         double dj_avg = sumdj / weight;
251         double dj_sdev = sqrt(dj2_avg-dj_avg*dj_avg);
252         double di2_avg = sumdi2 / weight;
253         double di_avg = sumdi / weight;
254         double di_sdev = sqrt(di2_avg-di_avg*di_avg);
255         si.dj_avg = dj_avg;
256         si.dj_sdev = dj_sdev;
257         si.di_avg = di_avg;
258         si.di_sdev = di_sdev;
259         si.ncoeffs = weight;
260 
261 #ifdef  SPARSE_STRIPS
262         if (si.dj_avg > 0.5) {
263             std::cerr << "hstrip #" << s
264                 << " is too sparse ; switching to vstrips\n";
265             // data.erase(data.begin()+si.data_offset, data.end());
266             // break;
267         }
268 #endif
269         mm->dslices_info.push_back(si);
270     }
271 
272     /* There's an other option for the dense strips. For a given
273      * set of source values (fitting in L2), store delta_j's for
274      * a given destination value. Could actually be eight bits.
275      * When the direction changes, increase the destination
276      * pointer
277      */
278 #ifdef  SPARSE_STRIPS
279     /* We haven't finished the set of horizontal strips. Treat
280      * them as vertical sparse strips then. */
281 #endif
282 
283     /* TODO gather info directly from the dslices_info vector
284      * now that it has so much data */
285     if (nslices1) {
286         std::cout
287             << "// " << nslices1
288             << " sub-slices of " << (packbase+1)
289             << " " << rowcol[mm->public_->store_transposed] << "s\n";
290     }
291     std::cout
292         << "// " << nslices0
293         << " sub-slices of " << packbase << " "
294         << rowcol[mm->public_->store_transposed] << "s\n";
295 #ifdef  SLICE_STATS
296     std::cout << "info per sub-slice:";
297     typedef std::vector<slice_info>::const_iterator si_t;
298     for(si_t s = dslices_info.begin() ; s != dslices_info.end() ; s++) {
299         std::cout
300             << (s-dslices_info.begin())
301             << " " << s->dj_avg << " " << s->dj_sdev
302             << " " << s->di_avg << " " << s->di_sdev
303             << " " << s->di_max
304             << "\n";
305     }
306     // std::cout << "\n";
307 #endif
308     std::cout << std::flush;
309 }
310 
MATMUL_NAME(reload_cache)311 int MATMUL_NAME(reload_cache)(matmul_ptr mm0)
312 {
313     FILE * f;
314 
315     struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
316     f = matmul_common_reload_cache_fopen(sizeof(abelt), mm->public_, MM_MAGIC);
317     if (!f) return 0;
318 
319     size_t n;
320     MATMUL_COMMON_READ_ONE32(n, f);
321 
322     mm->data.resize(n);
323     MATMUL_COMMON_READ_MANY16(mm->data.data(), n, f);
324 
325     fclose(f);
326 
327     // data_t::const_iterator q = mm->data.begin();
328 
329     return 1;
330 }
331 
MATMUL_NAME(save_cache)332 void MATMUL_NAME(save_cache)(matmul_ptr mm0)
333 {
334     FILE * f;
335 
336     struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
337     f = matmul_common_save_cache_fopen(sizeof(abelt), mm->public_, MM_MAGIC);
338     if (!f) return;
339 
340     size_t n = mm->data.size();
341     MATMUL_COMMON_WRITE_ONE32(n, f);
342     MATMUL_COMMON_WRITE_MANY16(mm->data.data(), n, f);
343 
344     fclose(f);
345 }
346 
MATMUL_NAME(mul)347 void MATMUL_NAME(mul)(matmul_ptr mm0, void * xdst, void const * xsrc, int d)
348 {
349     struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
350     ASM_COMMENT("multiplication code");
351     const uint16_t * q = mm->data.data();
352 
353     uint16_t nhstrips = *q++;
354     q++;        // alignment.
355     uint32_t i = 0;
356     abdst_field x = mm->xab;
357     absrc_vec src = (absrc_vec) (const_cast<void*>(xsrc)); // typical C const problem.
358     abdst_vec dst = (abdst_vec) xdst;
359 
360     if (d == !mm->public_->store_transposed) {
361 #ifdef  SLICE_STATS
362         std::vector<slice_info>::iterator sit = dslices_info.begin();
363         double tick = oncpu_ticks();
364 #endif
365         for(uint16_t s = 0 ; s < nhstrips ; s++) {
366             uint32_t nrows_packed = matmul_sliced_data_s::read32(q);
367             abelt * where = dst + i;
368             abvec_set_zero(x, where, nrows_packed);
369             ASM_COMMENT("critical loop");
370             /* The external function must have the same semantics as this
371              * code block */
372             uint32_t ncoeffs_slice = matmul_sliced_data_s::read32(q);
373             uint32_t j = 0;
374             for(uint32_t c = 0 ; c < ncoeffs_slice ; c++) {
375                 j += *q++;
376                 uint32_t di = *q++;
377                 abadd(x, where[di], where[di], src[j]);
378             }
379             ASM_COMMENT("end of critical loop");
380             i += nrows_packed;
381 #ifdef  SLICE_STATS
382             if (!mm->dslices_info.empty()) {
383                 double ntick = oncpu_ticks();
384                 sit->spent_time += ntick - tick;
385                 tick = ntick;
386                 sit->npasses++;
387                 sit++;
388             }
389 #endif
390         }
391     } else {
392         /* d == 0 */
393         /* BEWARE, it's a priori sub-optimal ! In practice, the
394          * difference isn't so striking though. */
395         if (mm->public_->iteration[d] == 10) {
396             fprintf(stderr, "Warning: Doing many iterations with bad code\n");
397         }
398 
399         abvec_set_zero(x, dst, mm->public_->dim[!d]);
400         for(uint16_t s = 0 ; s < nhstrips ; s++) {
401             uint32_t j = 0;
402             uint32_t nrows_packed = matmul_sliced_data_s::read32(q);
403             ASM_COMMENT("critical loop");
404             uint32_t ncoeffs_slice = matmul_sliced_data_s::read32(q);
405             for(uint32_t c = 0 ; c < ncoeffs_slice ; c++) {
406                 j += *q++;
407                 uint32_t di = *q++;
408                 abadd(x, dst[j], dst[j], src[i+di]);
409             }
410             i += nrows_packed;
411             ASM_COMMENT("end of critical loop");
412         }
413     }
414     ASM_COMMENT("end of multiplication code");
415     mm->public_->iteration[d]++;
416 }
417 
MATMUL_NAME(report)418 void MATMUL_NAME(report)(matmul_ptr mm0 MAYBE_UNUSED, double scale MAYBE_UNUSED) {
419 #ifdef  SLICE_STATS
420     struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
421     if (mm->dslices_info.empty())
422         return;
423     std::ofstream o("dj.stats");
424     o << "// Report of timing per slice\n";
425     o << "// <snum> <nrows> <ncoeffs> <dj> <npasses> <spent> <M0>\n";
426     for(unsigned int s = 0 ; s < dslices_info.size() ; s++) {
427         const slice_info& t(mm->dslices_info[s]);
428         o << s
429             << " " << t.nrows
430             << " " << t.ncoeffs
431             << " " << t.dj_avg
432             << " " << t.npasses
433             << " " << t.spent_time
434             << " " << (t.spent_time/t.npasses/t.ncoeffs * 1.0e9)
435             << " " << t.dj_sdev
436             << " " << t.di_avg
437             << " " << t.di_sdev
438             << " " << t.di_max
439             << "\n";
440     }
441     o << std::flush;
442 #endif
443 }
444 
MATMUL_NAME(auxv)445 void MATMUL_NAME(auxv)(matmul_ptr mm0 MAYBE_UNUSED, int op MAYBE_UNUSED, va_list ap MAYBE_UNUSED)
446 {
447 }
448 
MATMUL_NAME(aux)449 void MATMUL_NAME(aux)(matmul_ptr mm0, int op, ...)
450 {
451     va_list ap;
452     va_start(ap, op);
453     MATMUL_NAME(auxv) (mm0, op, ap);
454     va_end(ap);
455 }
456 /* vim: set sw=4: */
457