1 #include "cado.h" // IWYU pragma: keep
2 // IWYU pragma: no_include <ext/alloc_traits.h>
3 #include <fstream> // IWYU pragma: keep
4 #ifdef SELECT_MPFQ_LAYER_u64k1
5 #include <limits.h> // for UINT_MAX
6 #include <stdlib.h> // for abort
7 #else
8 #include <gmp.h> // for mp_limb_t, __gmpn_cmp, __gmpn_copyi
9 #endif
10 #include <unistd.h> // pread // IWYU pragma: keep
11 #include "lingen_io_matpoly.hpp"
12 #include "lingen_matpoly_select.hpp"
13 #include "macros.h" // for ASSERT_ALWAYS
14 #include "params.h" // for cxx_param_list, param_list_decl_usage
15 #include "omp_proxy.h" // IWYU pragma: keep
16
17
18 // constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
19 constexpr const unsigned int splitwidth = matpoly::over_gf2 ? 64 : 1;
20
21 /* {{{ I/O helpers */
22
23 /* This is an indication of the number of bytes we read at a time for A
24 * (input) and F (output) */
25 unsigned int io_matpoly_block_size = 1 << 20;
26
lingen_io_matpoly_decl_usage(cxx_param_list & pl)27 void lingen_io_matpoly_decl_usage(cxx_param_list & pl)
28 {
29 param_list_decl_usage(pl, "io-block-size",
30 "chunk size for reading the input or writing the output");
31 }
32
lingen_io_matpoly_lookup_parameters(cxx_param_list & pl)33 void lingen_io_matpoly_lookup_parameters(cxx_param_list & pl)
34 {
35 param_list_lookup_string(pl, "io-block-size");
36 }
37
lingen_io_matpoly_interpret_parameters(cxx_param_list & pl)38 void lingen_io_matpoly_interpret_parameters(cxx_param_list & pl)
39 {
40 param_list_parse_uint(pl, "io-block-size", &(io_matpoly_block_size));
41 }
42
43 /* {{{ matpoly_write
44 * writes some of the matpoly data to f, either in ascii or binary
45 * format. This can be used to write only part of the data (degrees
46 * [k0..k1[). Returns the number of coefficients (i.e., matrices, so at
47 * most k1-k0) successfully written, or
48 * -1 on error (e.g. when some matrix was only partially written).
49 */
50
51 #ifndef SELECT_MPFQ_LAYER_u64k1
matpoly_write(abdst_field ab,std::ostream & os,matpoly const & M,unsigned int k0,unsigned int k1,int ascii,int transpose)52 int matpoly_write(abdst_field ab, std::ostream& os, matpoly const & M, unsigned int k0, unsigned int k1, int ascii, int transpose)
53 {
54 unsigned int m = transpose ? M.n : M.m;
55 unsigned int n = transpose ? M.m : M.n;
56 ASSERT_ALWAYS(k0 == k1 || (k0 < M.get_size() && k1 <= M.get_size()));
57 for(unsigned int k = k0 ; k < k1 ; k++) {
58 int err = 0;
59 int matnb = 0;
60 for(unsigned int i = 0 ; !err && i < m ; i++) {
61 for(unsigned int j = 0 ; !err && j < n ; j++) {
62 absrc_elt x;
63 x = transpose ? M.coeff(j, i, k) : M.coeff(i, j, k);
64 if (ascii) {
65 if (j) err = !(os << " ");
66 if (!err) err = !(abcxx_out(ab, os, x));
67 } else {
68 err = !(os.write((const char *) x, (size_t) abvec_elt_stride(ab, 1)));
69 }
70 if (!err) matnb++;
71 }
72 if (!err && ascii) err = !(os << "\n");
73 }
74 if (ascii) err = err || !(os << "\n");
75 if (err) {
76 return (matnb == 0) ? (int) (k - k0) : -1;
77 }
78 }
79 return k1 - k0;
80 }
81 #else
matpoly_write(abdst_field,std::ostream & os,matpoly const & M,unsigned int k0,unsigned int k1,int ascii,int transpose)82 int matpoly_write(abdst_field, std::ostream& os, matpoly const & M, unsigned int k0, unsigned int k1, int ascii, int transpose)
83 {
84 unsigned int m = M.m;
85 unsigned int n = M.n;
86 ASSERT_ALWAYS(k0 == k1 || (k0 < M.get_size() && k1 <= M.get_size()));
87 unsigned int mc = iceildiv(M.m, ULONG_BITS);
88 unsigned int nc = iceildiv(M.n, ULONG_BITS);
89 size_t ulongs_per_mat = transpose ? (M.n * mc) : (M.m * nc);
90 std::vector<unsigned long> buf(ulongs_per_mat);
91 for(unsigned int k = k0 ; k < k1 ; k++) {
92 buf.assign(ulongs_per_mat, 0);
93 bool err = false;
94 size_t kq = k / ULONG_BITS;
95 size_t kr = k % ULONG_BITS;
96 unsigned long km = 1UL << kr;
97 if (!transpose) {
98 for(unsigned int i = 0 ; i < m ; i++) {
99 unsigned long * v = &(buf[i * nc]);
100 for(unsigned int j = 0 ; j < n ; j++) {
101 unsigned int jq = j / ULONG_BITS;
102 unsigned int jr = j % ULONG_BITS;
103 unsigned long bit = (M.part(i, j)[kq] & km) != 0;
104 v[jq] |= bit << jr;
105 }
106 }
107 } else {
108 for(unsigned int j = 0 ; j < n ; j++) {
109 unsigned long * v = &(buf[j * mc]);
110 for(unsigned int i = 0 ; i < m ; i++) {
111 unsigned int iq = i / ULONG_BITS;
112 unsigned int ir = i % ULONG_BITS;
113 unsigned long bit = (M.part(i, j)[kq] & km) != 0;
114 v[iq] |= bit << ir;
115 }
116 }
117 }
118 if (ascii) {
119 /* do we have an endian-robust wordsize-robust convention for
120 * printing bitstrings in hex ?
121 *
122 * it's not even clear that we should care -- after all, as
123 * long as mksol follows a consistent convention too, we
124 * should be fine.
125 */
126 abort();
127 } else {
128 err = !(os.write((const char*) buf.data(), ulongs_per_mat * sizeof(unsigned long)));
129 }
130 if (err) return -1;
131 }
132 return k1 - k0;
133 }
134 #endif
135 /* }}} */
136
137 #define VOID_POINTER_ADD(x, k) (((char*)(x))+(k))
138
139 /* fw must be an array of ofstreams of exactly the same size as the
140 * matrix to be written.
141 */
matpoly_write_split(abdst_field ab MAYBE_UNUSED,std::vector<std::ofstream> & fw,matpoly const & M,unsigned int k0,unsigned int k1,int ascii)142 int matpoly_write_split(abdst_field ab MAYBE_UNUSED, std::vector<std::ofstream> & fw, matpoly const & M, unsigned int k0, unsigned int k1, int ascii)
143 {
144 ASSERT_ALWAYS(k0 == k1 || (k0 < M.get_size() && k1 <= M.get_size()));
145 #ifdef SELECT_MPFQ_LAYER_u64k1
146 size_t ulongs_per_mat = splitwidth * splitwidth / ULONG_BITS;
147 std::vector<unsigned long> buf(ulongs_per_mat);
148 #endif
149 for(unsigned int k = k0 ; k < k1 ; k++) {
150 int err = 0;
151 int matnb = 0;
152 for(unsigned int i = 0 ; !err && i < M.m ; i += splitwidth) {
153 for(unsigned int j = 0 ; !err && j < M.n ; j += splitwidth) {
154 std::ostream& os = fw[i/splitwidth*M.n/splitwidth+j/splitwidth];
155 #ifndef SELECT_MPFQ_LAYER_u64k1
156 absrc_elt x = M.coeff(i, j, k);
157 if (ascii) {
158 err = !(abcxx_out(ab, os, x));
159 if (!err) err = !(os << "\n");
160 } else {
161 err = !(os.write((const char *) x, (size_t) abvec_elt_stride(ab, 1)));
162 }
163 #else
164 if (ascii)
165 abort();
166 buf.assign(ulongs_per_mat, 0);
167 size_t kq = k / ULONG_BITS;
168 size_t kr = k % ULONG_BITS;
169 for(unsigned int di = 0 ; di < splitwidth ; di++) {
170 unsigned int ii = i + di;
171 unsigned int ulongs_per_row = splitwidth / ULONG_BITS;
172 for(unsigned int dj0 = 0 ; dj0 < ulongs_per_row ; dj0++) {
173 for(unsigned int dj = 0 ; dj < ULONG_BITS ; dj++) {
174 unsigned int jj = j + dj0 * ULONG_BITS + dj;
175 const unsigned long * mij = M.part(ii, jj);
176 unsigned long bit = (mij[kq] >> kr) & 1;
177 buf[di * ulongs_per_row + dj0] ^= bit << dj;
178 }
179 }
180 }
181 err = !(os.write((const char *) buf.data(), ulongs_per_mat * sizeof(unsigned long)));
182 #endif
183 if (!err) matnb++;
184 }
185 }
186 if (err) {
187 return (matnb == 0) ? (int) (k - k0) : -1;
188 }
189 }
190 return k1 - k0;
191 }
192 /* }}} */
193
194 /* {{{ matpoly_read
195 * reads some of the matpoly data from f, either in ascii or binary
196 * format. This can be used to parse only part of the data (degrees
197 * [k0..k1[, k1 being an upper bound). Returns the number of coefficients
198 * (i.e., matrices, so at most k1-k0) successfully read, or
199 * -1 on error (e.g. when some matrix was only partially read).
200 *
201 * Note that the matrix must *not* be in pre-init state. It must have
202 * been already allocated.
203 */
204
205 #ifndef SELECT_MPFQ_LAYER_u64k1
matpoly_read(abdst_field ab,FILE * f,matpoly & M,unsigned int k0,unsigned int k1,int ascii,int transpose)206 int matpoly_read(abdst_field ab, FILE * f, matpoly & M, unsigned int k0, unsigned int k1, int ascii, int transpose)
207 {
208 ASSERT_ALWAYS(!M.check_pre_init());
209 unsigned int m = transpose ? M.n : M.m;
210 unsigned int n = transpose ? M.m : M.n;
211 ASSERT_ALWAYS(k0 == k1 || (k0 < M.get_size() && k1 <= M.get_size()));
212 mpz_srcptr pz = abfield_characteristic_srcptr(ab);
213 #if GMP_VERSION_ATLEAST(6, 0, 0)
214 mp_size_t pzsize = mpz_size(pz);
215 const mp_limb_t * pzlimbs = mpz_limbs_read(pz);
216 #else
217 mp_size_t pzsize = SIZ(pz);
218 const mp_limb_t * pzlimbs = PTR(pz);
219 #endif
220 std::vector<mp_limb_t> vbuf(pzsize + 1);
221 mp_limb_t * buf = vbuf.data();
222 for(unsigned int k = k0 ; k < k1 ; k++) {
223 int err = 0;
224 int matnb = 0;
225 for(unsigned int i = 0 ; !err && i < m ; i++) {
226 for(unsigned int j = 0 ; !err && j < n ; j++) {
227 abdst_elt x;
228 x = transpose ? M.coeff(j, i, k)
229 : M.coeff(i, j, k);
230 if (ascii) {
231 err = abfscan(ab, f, x) == 0;
232 } else {
233 err = fread(x, abvec_elt_stride(ab, 1), 1, f) < 1;
234 if (mpn_cmp(x, pzlimbs, pzsize >= 0)) {
235 mpn_tdiv_qr(buf + pzsize, buf, 0, x, pzsize, pzlimbs, pzsize);
236 mpn_copyi(x, buf, pzsize);
237 }
238 }
239 if (!err) matnb++;
240 }
241 }
242 if (err) {
243 if (matnb == 0)
244 return (int) (k - k0);
245 fprintf(stderr, "I/O error after %d,%d coefficients (matrix size %u*%u)\n",
246 (int) (k-k0), matnb, m, n);
247 return -1;
248 }
249 }
250 return k1 - k0;
251 }
252 #else
matpoly_read_inner(abdst_field,FILE * f,matpoly & M,unsigned int k0,unsigned int k1,int ascii,int transpose,off_t base,unsigned int batch=1)253 int matpoly_read_inner(abdst_field, FILE * f, matpoly & M, unsigned int k0, unsigned int k1, int ascii, int transpose, off_t base, unsigned int batch = 1)
254 {
255 /* Internally, the dimension of the matrix that are most packed
256 * (i.e., rows when in row-major order) are padded to multiples of
257 * ULONG_BITS
258 */
259 unsigned int m = M.m;
260 unsigned int n = M.n;
261 ASSERT_ALWAYS(k0 == k1 || (k0 < M.get_size() && k1 <= M.get_size()));
262 unsigned int mc = iceildiv(M.m, ULONG_BITS);
263 unsigned int nc = iceildiv(M.n, ULONG_BITS);
264 size_t ulongs_per_mat = transpose ? (M.n * mc) : (M.m * nc);
265 batch = MIN(batch, k1 - k0);
266 std::vector<unsigned long> buf(ulongs_per_mat * batch);
267 for(unsigned int k = k0 ; k < k1 ; k+=batch) {
268 if (k + batch > k1) batch = k1 - k;
269 unsigned int good;
270 if (ascii) {
271 /* do we have an endian-robust wordsize-robust convention for
272 * printing bitstrings in hex ?
273 *
274 * it's not even clear that we should care -- after all, as long as
275 * mksol follows a consistent convention too, we should be fine.
276 */
277 abort();
278 } else {
279 int rc;
280 if (base < 0) {
281 rc = fread(buf.data(), sizeof(unsigned long), ulongs_per_mat * batch, f);
282 if (rc != (int) (ulongs_per_mat * batch) && ferror(f))
283 return k - k0;
284 } else {
285 /* use pread -- good for multithreading */
286 size_t one = ulongs_per_mat * sizeof(unsigned long);
287 off_t off = base + (k - k0) * one;
288 ssize_t r = pread(fileno(f), buf.data(), one * batch, off);
289 if (r < 0) rc = 0;
290 else rc = r / sizeof(unsigned long);
291 }
292 good = rc / ulongs_per_mat;
293 }
294 for(unsigned int b = 0 ; b < good ; b++) {
295 size_t kq = (k + b) / ULONG_BITS;
296 size_t kr = (k + b) % ULONG_BITS;
297 if (!transpose) {
298 for(unsigned int i = 0 ; i < m ; i++) {
299 unsigned long * v = &(buf[b * ulongs_per_mat + i * nc]);
300 for(unsigned int j = 0 ; j < n ; j++) {
301 unsigned int jq = j / ULONG_BITS;
302 unsigned int jr = j % ULONG_BITS;
303 unsigned long bit = (v[jq] >> jr) & 1;
304 M.part(i, j)[kq] &= ~(1UL << kr);
305 M.part(i, j)[kq] |= bit << kr;
306 }
307 }
308 } else {
309 for(unsigned int j = 0 ; j < n ; j++) {
310 unsigned long * v = &(buf[b * ulongs_per_mat + j * mc]);
311 for(unsigned int i = 0 ; i < m ; i++) {
312 unsigned int iq = i / ULONG_BITS;
313 unsigned int ir = i % ULONG_BITS;
314 unsigned long bit = (v[iq] >> ir) & 1;
315 M.part(i, j)[kq] &= ~(1UL << kr);
316 M.part(i, j)[kq] |= bit << kr;
317 }
318 }
319 }
320 }
321 if (good < batch) {
322 return k - k0 + good;
323 }
324 }
325 return k1 - k0;
326 }
matpoly_read(abdst_field ab,FILE * f,matpoly & M,unsigned int k0,unsigned int k1,int ascii,int transpose)327 int matpoly_read(abdst_field ab, FILE * f, matpoly & M, unsigned int k0, unsigned int k1, int ascii, int transpose)
328 {
329 int rc = 0;
330 if (k0 % ULONG_BITS) {
331 unsigned int fk0 = MIN(k1, k0 + ULONG_BITS - (k0 % ULONG_BITS));
332 rc += matpoly_read_inner(ab, f, M, k0, fk0, ascii, transpose, -1);
333 if (rc < (int) (fk0 - k0) || fk0 == k1)
334 return rc;
335 return rc + matpoly_read(ab, f, M, fk0, k1, ascii, transpose);
336 }
337 if (k1 % ULONG_BITS) {
338 unsigned int fk1 = MAX(k0, k1 - (k1 % ULONG_BITS));
339 if (k0 < fk1) {
340 /* recurse and to the bulk of the processing on aligned
341 * values */
342 rc += matpoly_read(ab, f, M, k0, fk1, ascii, transpose);
343 if (rc < (int) (fk1 - k0))
344 return rc;
345 }
346 return rc + matpoly_read_inner(ab, f, M, fk1, k1, ascii, transpose, -1);
347 }
348
349 ASSERT_ALWAYS(!(k0 % ULONG_BITS));
350 ASSERT_ALWAYS(!(k1 % ULONG_BITS));
351
352 off_t pos0 = ftell(f);
353 unsigned int mc = iceildiv(M.m, ULONG_BITS);
354 unsigned int nc = iceildiv(M.n, ULONG_BITS);
355 size_t ulongs_per_mat = transpose ? (M.n * mc) : (M.m * nc);
356 size_t one = ulongs_per_mat * sizeof(unsigned long);
357
358 unsigned int nth = 1;
359 #ifdef HAVE_OPENMP
360 nth = omp_get_max_threads();
361 #endif
362 unsigned int dk = ((k1 - k0)/ULONG_BITS) / nth;
363 unsigned int mk = ((k1 - k0)/ULONG_BITS) % nth;
364 #ifdef HAVE_OPENMP
365 #pragma omp parallel num_threads(nth)
366 #endif
367 {
368 unsigned int idx = 0;
369 #ifdef HAVE_OPENMP
370 idx = omp_get_thread_num();
371 #endif
372 unsigned int lk0 = k0 / ULONG_BITS + idx * dk + MIN(idx, mk);
373 unsigned int lk1 = lk0 + dk + (idx < mk);
374 lk0 *= ULONG_BITS;
375 lk1 *= ULONG_BITS;
376
377 off_t base = pos0 + (lk0 - k0) * one;
378 int my_rc = matpoly_read_inner(ab, f, M, lk0, lk1, ascii, transpose, base, UINT_MAX);
379
380 #ifdef HAVE_OPENMP
381 #pragma omp critical
382 #endif
383 rc += my_rc;
384 }
385
386 fseek(f, pos0 + rc * one, SEEK_SET);
387 return rc;
388 }
389
390 #endif
391 /* }}} */
392
393