1 /* Manage the in-memory data for the matrix */
2 /* It's in C++ because the STL is handy, but that's really all there is
3 * to it... */
4 #include "cado.h" // IWYU pragma: keep
5 #include <cstdio>
6 #include <cmath>
7 #include <cstdint>
8 #include <cstring>
9 #include <cstdarg> // for va_list, va_end, va_start
10
11 // C++ headers.
12 #include <utility> // for pair, make_pair
13 #include <vector>
14 #include <algorithm> // sort
15 #include <iostream> // cout
16
17 using namespace std;
18
19 #include "matmul.h" // for matmul_ptr, matmul_public_s
20 #include "mpfq_layer.h"
21 #include "matmul-common.h"
22 #include "matmul_facade.h"
23 #include "macros.h"
24 #include "params.h"
25
26 // assembly is now disabled for this code because the semantics of the
27 // asm code have changed.
28
29 // #define L1_CACHE_SIZE 32768
30 // take only 3/4 of the L1 cache.
31 #define L1_CACHE_SIZE 24576
32 // for 8-bytes abelt values, this gives 3072 items.
33
34 /* Here is how the matrix is stored in memory, in the
35 * "matmul_sliced_data_s" type. The matrix is cut in slices, where each
36 * slice is a set of contiguous rows. The size of a slice is tuned so as
37 * to fill the L1 cache as per the macro above, with some adjustment to
38 * handle non-exact divisibility: some slices will have packbase rows and
39 * some others packbase+1 rows. Within a slice, entries are stored as a
40 * list of pairs
41 * (column index, row index),
42 * sorted according to column index. Then, only the difference between
43 * two consecutive column indices is actually stored, so that it will fit
44 * in 16 bits. Also the row index fits in 16 bits, so that a slice is
45 * actually a list of 16-bit unsigned integers.
46 * All the slices are stored in a big array of uint16_t, in the data
47 * field. Within this data field, the information is organized like so:
48 * data[0] : total number of slices
49 * data[1] : number of rows in slice number 1
50 * data[2..3]: number of entries in slice number 1, say k_1
51 * data[4..(4+2*k_1)]: list of entries of slice number 1
52 * data[(4+2k_1+1)]: number of rows in slice number 2
53 * etc...
54 * Additionally, for each slice, a slice_info structure is stored, giving
55 * statistics about the slice, and the offset in the data array where it
56 * is stored.
57 */
58
59 /* This extension is used to distinguish between several possible
60 * implementations of the product. The upper word correspond to the
61 * implementation itself, the lower one to the n-th binary incompatible
62 * change (make sure to bump it) */
63 #define MM_EXTENSION "-sliced"
64
65 #define MM_MAGIC_FAMILY 0xa000UL
66 #define MM_MAGIC_VERSION 0x1007UL
67 #define MM_MAGIC (MM_MAGIC_FAMILY << 16 | MM_MAGIC_VERSION)
68
69 /* see matmul-basic.c */
70 #define MM_DIR0_PREFERS_TRANSP_MULT 1
71
72 struct slice_info {
73 unsigned int nrows;
74 unsigned int ncoeffs;
75 double spent_time;
76 unsigned int npasses;
77 double dj_avg;
78 double dj_sdev;
79 double di_avg;
80 double di_sdev;
81 int32_t di_max;
82 size_t data_offset;
83 };
84
85 typedef vector<uint16_t> data_t;
86
87 struct matmul_sliced_data_s {
88 /* repeat the fields from the public_ interface */
89 struct matmul_public_s public_[1];
90 /* now our private fields */
91 abdst_field xab;
92 data_t data;
93 vector<slice_info> dslices_info;
94 unsigned int npack;
pushmatmul_sliced_data_s95 void push(uint32_t x)
96 {
97 ASSERT_ALWAYS(x >> 16 == 0);
98 data.push_back(x);
99 }
push32matmul_sliced_data_s100 void push32(uint64_t x)
101 {
102 ASSERT_ALWAYS(x >> 32 == 0);
103 data.push_back(x & ((1u << 16) - 1));
104 x >>= 16;
105 data.push_back(x & ((1u << 16) - 1));
106 }
read32matmul_sliced_data_s107 static inline uint32_t read32(data_t::const_iterator & q) {
108 uint32_t res;
109 res = *q++;
110 res |= ((uint32_t) *q++) << 16;
111 return res;
112 }
read32matmul_sliced_data_s113 static inline uint32_t read32(const uint16_t * & q) {
114 uint32_t res;
115 res = *q++;
116 res |= ((uint32_t) *q++) << 16;
117 return res;
118 }
119 };
120
MATMUL_NAME(clear)121 void MATMUL_NAME(clear)(matmul_ptr mm0)
122 {
123 struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
124 matmul_common_clear(mm->public_);
125 mm->dslices_info.clear();
126 mm->data.clear();
127 delete mm;
128 }
129
MATMUL_NAME(init)130 matmul_ptr MATMUL_NAME(init)(void * pxx, param_list pl, int optimized_direction)
131 {
132 abdst_field xx = (abdst_field) pxx;
133 struct matmul_sliced_data_s * mm;
134 mm = new matmul_sliced_data_s;
135 // see bug 21663
136 memset((void*)mm, 0, sizeof(struct matmul_sliced_data_s));
137 mm->xab = xx;
138
139 unsigned int npack = L1_CACHE_SIZE;
140 if (pl) param_list_parse_uint(pl, "l1_cache_size", &npack);
141 npack /= sizeof(abelt);
142 mm->npack = npack;
143
144 int suggest = optimized_direction ^ MM_DIR0_PREFERS_TRANSP_MULT;
145 mm->public_->store_transposed = suggest;
146 if (pl) {
147 param_list_parse_int(pl, "mm_store_transposed",
148 &mm->public_->store_transposed);
149 if (mm->public_->store_transposed != suggest) {
150 fprintf(stderr, "Warning, mm_store_transposed"
151 " overrides suggested matrix storage ordering\n");
152 }
153 }
154
155
156 return (matmul_ptr) mm;
157 }
158
159
MATMUL_NAME(build_cache)160 void MATMUL_NAME(build_cache)(matmul_ptr mm0, uint32_t * data, size_t size MAYBE_UNUSED)
161 {
162 struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
163 ASSERT_ALWAYS(data);
164
165 uint32_t i0 = 0;
166 uint32_t i1 = mm->public_->dim[ mm->public_->store_transposed];
167
168 unsigned int nslices = iceildiv(i1-i0, mm->npack);
169 unsigned int nslices_index = mm->data.size();
170 mm->push(0);
171 mm->push(0); // placeholder for alignment
172
173 unsigned int packbase = (i1-i0) / nslices;
174
175 unsigned int nslices1 = (i1-i0) % nslices; /* 1+packbase-sized slices */
176 unsigned int nslices0 = nslices - nslices1; /* packbase-sized slices */
177 unsigned int current;
178 unsigned int next = i0;
179 unsigned int s;
180
181 uint32_t * ptr = data;
182
183 for(s = 0 ; s < nslices ; s++) {
184 current = next;
185 unsigned int npack = packbase + (s < nslices1);
186 next = current + npack;
187
188 slice_info si; memset(&si,0,sizeof(si)); si.nrows = npack;
189 si.data_offset = mm->data.size();
190
191 /*
192 std::cout << "Packing " << npack << " rows from " << current
193 << " to " << next << std::endl;
194 */
195 typedef std::vector<std::pair<uint32_t, uint32_t> > L_t;
196 typedef L_t::const_iterator Lci_t;
197 L_t L;
198 for(unsigned int di = 0 ; di < npack ; di++) {
199 for(unsigned int j = 0 ; j < *ptr ; j++) {
200 L.push_back(std::make_pair(ptr[1+j],di));
201 }
202 mm->public_->ncoeffs += *ptr;
203 ptr += 1 + *ptr;
204 }
205 /* L is the list of (column index, row index) of all
206 * coefficients in the current horizontal slice */
207 std::sort(L.begin(), L.end());
208
209 mm->push32(npack);
210 mm->push32(L.size());
211
212 uint32_t j = 0;
213 uint32_t i = 0;
214 int32_t di_max = 0;
215 double sumdj=0;
216 double sumdj2=0;
217 double sumdi=0;
218 double sumdi2=0;
219 uint32_t weight=0;
220 for(Lci_t lp = L.begin() ; lp != L.end() ; ++lp) {
221 uint32_t dj = lp->first - j; j += dj;
222 int32_t di = lp->second - i; i += di;
223 weight++;
224
225 if (di<0) di = -di;
226 if (di > di_max) di_max = di;
227
228 double ddj = dj;
229 double ddi = di;
230 sumdj+=ddj;
231 sumdj2+=ddj*ddj;
232 sumdi+=ddi;
233 sumdi2+=ddi*ddi;
234
235 /* If di exceeds +/- 128, then we can emulate the
236 * wide displacement by something like:
237 * (di_max,0)(0,0)(di_max,0)(0,0)...(di_remainder,dj)
238 *
239 * It's also really likely that one would gain by
240 * moving _both_ up and down in the strip, so that
241 * the negative offsets remain significant
242 * (otherwise, we'd rather do a cmov based on the
243 * sign, or something).
244 */
245
246 mm->push(dj); mm->push(i);
247 }
248 mm->data[nslices_index]++;
249 double dj2_avg = sumdj2 / weight;
250 double dj_avg = sumdj / weight;
251 double dj_sdev = sqrt(dj2_avg-dj_avg*dj_avg);
252 double di2_avg = sumdi2 / weight;
253 double di_avg = sumdi / weight;
254 double di_sdev = sqrt(di2_avg-di_avg*di_avg);
255 si.dj_avg = dj_avg;
256 si.dj_sdev = dj_sdev;
257 si.di_avg = di_avg;
258 si.di_sdev = di_sdev;
259 si.ncoeffs = weight;
260
261 #ifdef SPARSE_STRIPS
262 if (si.dj_avg > 0.5) {
263 std::cerr << "hstrip #" << s
264 << " is too sparse ; switching to vstrips\n";
265 // data.erase(data.begin()+si.data_offset, data.end());
266 // break;
267 }
268 #endif
269 mm->dslices_info.push_back(si);
270 }
271
272 /* There's an other option for the dense strips. For a given
273 * set of source values (fitting in L2), store delta_j's for
274 * a given destination value. Could actually be eight bits.
275 * When the direction changes, increase the destination
276 * pointer
277 */
278 #ifdef SPARSE_STRIPS
279 /* We haven't finished the set of horizontal strips. Treat
280 * them as vertical sparse strips then. */
281 #endif
282
283 /* TODO gather info directly from the dslices_info vector
284 * now that it has so much data */
285 if (nslices1) {
286 std::cout
287 << "// " << nslices1
288 << " sub-slices of " << (packbase+1)
289 << " " << rowcol[mm->public_->store_transposed] << "s\n";
290 }
291 std::cout
292 << "// " << nslices0
293 << " sub-slices of " << packbase << " "
294 << rowcol[mm->public_->store_transposed] << "s\n";
295 #ifdef SLICE_STATS
296 std::cout << "info per sub-slice:";
297 typedef std::vector<slice_info>::const_iterator si_t;
298 for(si_t s = dslices_info.begin() ; s != dslices_info.end() ; s++) {
299 std::cout
300 << (s-dslices_info.begin())
301 << " " << s->dj_avg << " " << s->dj_sdev
302 << " " << s->di_avg << " " << s->di_sdev
303 << " " << s->di_max
304 << "\n";
305 }
306 // std::cout << "\n";
307 #endif
308 std::cout << std::flush;
309 }
310
MATMUL_NAME(reload_cache)311 int MATMUL_NAME(reload_cache)(matmul_ptr mm0)
312 {
313 FILE * f;
314
315 struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
316 f = matmul_common_reload_cache_fopen(sizeof(abelt), mm->public_, MM_MAGIC);
317 if (!f) return 0;
318
319 size_t n;
320 MATMUL_COMMON_READ_ONE32(n, f);
321
322 mm->data.resize(n);
323 MATMUL_COMMON_READ_MANY16(mm->data.data(), n, f);
324
325 fclose(f);
326
327 // data_t::const_iterator q = mm->data.begin();
328
329 return 1;
330 }
331
MATMUL_NAME(save_cache)332 void MATMUL_NAME(save_cache)(matmul_ptr mm0)
333 {
334 FILE * f;
335
336 struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
337 f = matmul_common_save_cache_fopen(sizeof(abelt), mm->public_, MM_MAGIC);
338 if (!f) return;
339
340 size_t n = mm->data.size();
341 MATMUL_COMMON_WRITE_ONE32(n, f);
342 MATMUL_COMMON_WRITE_MANY16(mm->data.data(), n, f);
343
344 fclose(f);
345 }
346
MATMUL_NAME(mul)347 void MATMUL_NAME(mul)(matmul_ptr mm0, void * xdst, void const * xsrc, int d)
348 {
349 struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
350 ASM_COMMENT("multiplication code");
351 const uint16_t * q = mm->data.data();
352
353 uint16_t nhstrips = *q++;
354 q++; // alignment.
355 uint32_t i = 0;
356 abdst_field x = mm->xab;
357 absrc_vec src = (absrc_vec) (const_cast<void*>(xsrc)); // typical C const problem.
358 abdst_vec dst = (abdst_vec) xdst;
359
360 if (d == !mm->public_->store_transposed) {
361 #ifdef SLICE_STATS
362 std::vector<slice_info>::iterator sit = dslices_info.begin();
363 double tick = oncpu_ticks();
364 #endif
365 for(uint16_t s = 0 ; s < nhstrips ; s++) {
366 uint32_t nrows_packed = matmul_sliced_data_s::read32(q);
367 abelt * where = dst + i;
368 abvec_set_zero(x, where, nrows_packed);
369 ASM_COMMENT("critical loop");
370 /* The external function must have the same semantics as this
371 * code block */
372 uint32_t ncoeffs_slice = matmul_sliced_data_s::read32(q);
373 uint32_t j = 0;
374 for(uint32_t c = 0 ; c < ncoeffs_slice ; c++) {
375 j += *q++;
376 uint32_t di = *q++;
377 abadd(x, where[di], where[di], src[j]);
378 }
379 ASM_COMMENT("end of critical loop");
380 i += nrows_packed;
381 #ifdef SLICE_STATS
382 if (!mm->dslices_info.empty()) {
383 double ntick = oncpu_ticks();
384 sit->spent_time += ntick - tick;
385 tick = ntick;
386 sit->npasses++;
387 sit++;
388 }
389 #endif
390 }
391 } else {
392 /* d == 0 */
393 /* BEWARE, it's a priori sub-optimal ! In practice, the
394 * difference isn't so striking though. */
395 if (mm->public_->iteration[d] == 10) {
396 fprintf(stderr, "Warning: Doing many iterations with bad code\n");
397 }
398
399 abvec_set_zero(x, dst, mm->public_->dim[!d]);
400 for(uint16_t s = 0 ; s < nhstrips ; s++) {
401 uint32_t j = 0;
402 uint32_t nrows_packed = matmul_sliced_data_s::read32(q);
403 ASM_COMMENT("critical loop");
404 uint32_t ncoeffs_slice = matmul_sliced_data_s::read32(q);
405 for(uint32_t c = 0 ; c < ncoeffs_slice ; c++) {
406 j += *q++;
407 uint32_t di = *q++;
408 abadd(x, dst[j], dst[j], src[i+di]);
409 }
410 i += nrows_packed;
411 ASM_COMMENT("end of critical loop");
412 }
413 }
414 ASM_COMMENT("end of multiplication code");
415 mm->public_->iteration[d]++;
416 }
417
MATMUL_NAME(report)418 void MATMUL_NAME(report)(matmul_ptr mm0 MAYBE_UNUSED, double scale MAYBE_UNUSED) {
419 #ifdef SLICE_STATS
420 struct matmul_sliced_data_s * mm = (struct matmul_sliced_data_s *) mm0;
421 if (mm->dslices_info.empty())
422 return;
423 std::ofstream o("dj.stats");
424 o << "// Report of timing per slice\n";
425 o << "// <snum> <nrows> <ncoeffs> <dj> <npasses> <spent> <M0>\n";
426 for(unsigned int s = 0 ; s < dslices_info.size() ; s++) {
427 const slice_info& t(mm->dslices_info[s]);
428 o << s
429 << " " << t.nrows
430 << " " << t.ncoeffs
431 << " " << t.dj_avg
432 << " " << t.npasses
433 << " " << t.spent_time
434 << " " << (t.spent_time/t.npasses/t.ncoeffs * 1.0e9)
435 << " " << t.dj_sdev
436 << " " << t.di_avg
437 << " " << t.di_sdev
438 << " " << t.di_max
439 << "\n";
440 }
441 o << std::flush;
442 #endif
443 }
444
MATMUL_NAME(auxv)445 void MATMUL_NAME(auxv)(matmul_ptr mm0 MAYBE_UNUSED, int op MAYBE_UNUSED, va_list ap MAYBE_UNUSED)
446 {
447 }
448
MATMUL_NAME(aux)449 void MATMUL_NAME(aux)(matmul_ptr mm0, int op, ...)
450 {
451 va_list ap;
452 va_start(ap, op);
453 MATMUL_NAME(auxv) (mm0, op, ap);
454 va_end(ap);
455 }
456 /* vim: set sw=4: */
457