1 /******************************************************************************
2 *
3 * M4RI: Linear Algebra over GF(2)
4 *
5 * Copyright (C) 2007 Gregory Bard <gregory.bard@ieee.org>
6 * Copyright (C) 2009-2013 Martin Albrecht <martinralbrecht+m4ri@googlemail.com>
7 * Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
8 *
9 * Distributed under the terms of the GNU General Public License (GPL)
10 * version 2 or higher.
11 *
12 * This code is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * The full text of the GPL is available at:
18 *
19 * http://www.gnu.org/licenses/
20 ******************************************************************************/
21
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
25
26 #ifdef __M4RI_HAVE_LIBPNG
27 #include <png.h>
28 #endif
29
30 #include <stdlib.h>
31 #include <string.h>
32 #include "mzd.h"
33 #include "parity.h"
34 #include "mmc.h"
35
36
/**
 * \brief Cache block holding up to 64 mzd_t containers.
 *
 * Blocks form a doubly-linked list rooted at the static mzd_cache below;
 * 'used' is a bitmask recording which of the 64 slots are handed out.
 */

typedef struct mzd_t_cache {
  mzd_t mzd[64]; /*!< cached matrices */
  struct mzd_t_cache *prev; /*!< previous block */
  struct mzd_t_cache *next; /*!< next block */
  uint64_t used; /*!< bitmasks which matrices in this block are used */
  /* Pad the bookkeeping fields up to one full mzd_t so the struct size is a
   * multiple of sizeof(mzd_t) (mzd_init asserts sizeof(mzd_t) == 64). */
  unsigned char padding[sizeof(mzd_t) - 2 * sizeof(struct mzd_t_cache*) - sizeof(uint64_t)]; /*!< alignment */
#ifdef __GNUC__
} mzd_t_cache_t __attribute__ ((__aligned__ (64)));
#else
} mzd_t_cache_t;
#endif
52
/* Never grow the cache list beyond this many 64-entry blocks; further
 * allocations fall back to plain m4ri_mm_malloc. */
#define __M4RI_MZD_T_CACHE_MAX 16
static mzd_t_cache_t mzd_cache;                    /* static head block, never freed */
static mzd_t_cache_t* current_cache = &mzd_cache;  /* block serving the next allocation */
56
/*
 * Return floor(log2(v)) for v > 0, i.e. the index of the highest set bit
 * (0 for v == 0 as well). Binary search over bit positions: test the top
 * half of the remaining range, shift it down if non-empty, and accumulate
 * the shift amounts into the result.
 */
static int log2_floor(uint64_t v) {
  static uint64_t const masks[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000 };
  static unsigned int const shifts[] = { 1, 2, 4, 8, 16, 32 };
  unsigned int result = 0;
  int idx = 5;
  while (idx >= 0) {
    if (v & masks[idx]) {
      v >>= shifts[idx];
      result |= shifts[idx];
    }
    --idx;
  }
  return result;
}
71
72 /*
73 * Return a pointer to a new mzd_t structure.
74 * The structure will be 64 byte aligned.
75 * Call mzd_t_free to free the structure for next use.
76 */
77
mzd_t_malloc()78 static mzd_t* mzd_t_malloc() {
79 #if __M4RI_ENABLE_MZD_CACHE == 0
80 return (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
81 #else
82 mzd_t *ret = NULL;
83 int i=0;
84
85 if (current_cache->used == (uint64_t)-1) {
86 mzd_t_cache_t *cache = &mzd_cache;
87 while (cache && cache->used == (uint64_t)-1) {
88 current_cache = cache;
89 cache = cache->next;
90 i++;
91 }
92 if (!cache && i< __M4RI_MZD_T_CACHE_MAX) {
93 cache = (mzd_t_cache_t*)m4ri_mm_malloc_aligned(sizeof(mzd_t_cache_t), 64);
94 memset((char*)cache, 0, sizeof(mzd_t_cache_t));
95
96 cache->prev = current_cache;
97 current_cache->next = cache;
98 current_cache = cache;
99 } else if (!cache && i>= __M4RI_MZD_T_CACHE_MAX) {
100 /* We have reached the upper limit on the number of caches */
101 ret = (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
102 } else {
103 current_cache = cache;
104 }
105 }
106 if (ret == NULL) {
107 int free_entry =log2_floor(~current_cache->used);
108 current_cache->used |= ((uint64_t)1 << free_entry);
109 ret = ¤t_cache->mzd[free_entry];
110 }
111 return ret;
112 #endif //__M4RI_ENABLE_MZD_CACHE
113 }
114
/*
 * Return a matrix container previously handed out by mzd_t_malloc.
 *
 * If M lives inside one of the cache blocks its slot bit is cleared; a
 * secondary block that becomes entirely unused is unlinked from the list
 * and freed (the static head block is never freed). A matrix that was
 * heap-allocated because the cache was full is simply freed.
 */
static void mzd_t_free(mzd_t *M) {
#if __M4RI_ENABLE_MZD_CACHE == 0
  m4ri_mm_free(M);
#else
  int foundit = 0;
  mzd_t_cache_t *cache = &mzd_cache;
  while(cache) {
    /* NOTE(review): this pointer difference is only well-defined when M
     * actually points into this block's array; the unsigned compare below
     * is what rejects foreign pointers — confirm this is intentional. */
    size_t entry = M - cache->mzd;
    if (entry < 64) {
      cache->used &= ~((uint64_t)1 << entry);
      if (cache->used == 0) {
        /* Block is now completely empty. */
        if (cache == &mzd_cache) {
          /* Keep the static head block; just make it current again. */
          current_cache = cache;
        } else {
          if (cache == current_cache) {
            current_cache = cache->prev;
          }
          /* Unlink and release the empty secondary block. */
          cache->prev->next = cache->next;
          if (cache->next)
            cache->next->prev = cache->prev;
          m4ri_mm_free(cache);
        }
      }
      foundit = 1;
      break;
    }
    cache = cache->next;
  }
  if(!foundit) {
    /* M was allocated directly when the cache list was at its limit. */
    m4ri_mm_free(M);
  }
#endif //__M4RI_ENABLE_MZD_CACHE
}
148
/*
 * Create an r x c matrix over GF(2), zero-initialized.
 *
 * Row data is stored in one or more blocks of at most
 * __M4RI_MAX_MZD_BLOCKSIZE words each; A->rows[i] points directly at
 * the first word of row i. Free with mzd_free.
 */
mzd_t *mzd_init(rci_t r, rci_t c) {
  assert(sizeof(mzd_t) == 64);

  mzd_t *A = mzd_t_malloc();

  A->nrows = r;
  A->ncols = c;
  A->width = (c + m4ri_radix - 1) / m4ri_radix;  /* words needed per row */
  /* Wide rows with an odd word count get one padding word so the stride
   * is even; narrow rows stay unpadded. */
  A->rowstride = (A->width < mzd_paddingwidth || (A->width & 1) == 0) ? A->width : A->width + 1;
  A->high_bitmask = __M4RI_LEFT_BITMASK(c % m4ri_radix);
  A->flags = (A->high_bitmask != m4ri_ffff) ? mzd_flag_nonzero_excess : 0;
  A->offset_vector = 0;
  A->row_offset = 0;

  A->rows = (word**)m4ri_mmc_calloc(r + 1, sizeof(word*)); // We're overcomitting here.

  if (r && c) {
    /* blockrows = largest power of two rows fitting in one block. */
    int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride;
    A->blockrows_log = 0;
    while(blockrows >>= 1)
      A->blockrows_log++;
    blockrows = 1 << A->blockrows_log;

    int const blockrows_mask = blockrows - 1;
    int const nblocks = (r + blockrows - 1) / blockrows;
    A->flags |= (nblocks > 1) ? mzd_flag_multiple_blocks : 0;
    A->blocks = (mzd_block_t*)m4ri_mmc_calloc(nblocks + 1, sizeof(mzd_block_t));

    /* The last block may be partial; all earlier blocks hold exactly
     * 'blockrows' rows (block_words is reset inside the loop). */
    size_t block_words = (r - (nblocks - 1) * blockrows) * A->rowstride;
    for(int i = nblocks - 1; i >= 0; --i) {
      A->blocks[i].size = block_words * sizeof(word);
      A->blocks[i].begin = (word*)m4ri_mmc_calloc(1, A->blocks[i].size);
      A->blocks[i].end = A->blocks[i].begin + block_words;
      block_words = blockrows * A->rowstride;
    }

    /* Fill the row-pointer table from the per-block base addresses. */
    for(rci_t i = 0; i < A->nrows; ++i) {
      A->rows[i] = A->blocks[i >> A->blockrows_log].begin + (i & blockrows_mask) * A->rowstride;
    }

  } else {
    A->blocks = NULL;
  }

  return A;
}
195
196 /*
197 Explanation of offset_vector (in words), and row_offset.
198
199 <------------------------------- row_stride (in words)--------------------->
200 .---------------------------------------------------------------------------. <-- m->blocks[0].begin ^
201 | ^ /| |
202 | m->row_offset| m->offset_vector_/ | |
203 | v / | |
204 | .--------------------------------------------------------------------v<--|---- m->rows[0] |_ skipped_blocks (in blocks)
205 | |m (also a window) ^ | | |
206 | | | | | |
207 `---------------------------------|-----------------------------------------' v
208 .---------------------------------|----------------------------------------_. <-- m->blocks[1].begin <-- windows.blocks[0].begin
209 | | ^ lowr| |_^ |
210 | | window->row_offset| | window->offset_vector _-^| |
211 | | v v _-^ | |
212 | | .----------------------------------------------------------v<--. |<--|---- m->rows[lowr]
213 | | |window | `-|---|---- window->rows[0]
214 | | | | | |
215 `---------------------------------------------------------------------------'
216 .---------------------------------------------------------------------------. <-- m->blocks[2].begin <-- windows.blocks[1].begin
217 | | | | | |
218 | | | | lowc | |
219 | | | |<---->| |
220 | | | | \__|___|__ also wrd_offset (in words)
221 | | `----------------------------------------------------------' | |
222 | `--------------------------------------------------------------------' |
223 `---------------------------------------------------------------------------'
224 .---------------------------------------------------------------------------.
225 | |
226
227 */
228
/*
 * Create a window (shallow view) into M covering rows [lowr, highr) and
 * columns [lowc, highc).
 *
 * No matrix data is copied: the window shares M's blocks and its row
 * pointers point into M's memory, which is why lowc must be word-aligned.
 * Free with mzd_free (the window owns only its row table, not the data).
 */
mzd_t *mzd_init_window(mzd_t *M, const rci_t lowr, const rci_t lowc, const rci_t highr, const rci_t highc) {
  assert(lowc % m4ri_radix == 0);

  mzd_t *W = mzd_t_malloc();

  rci_t nrows = MIN(highr - lowr, M->nrows - lowr);
  rci_t ncols = highc - lowc;

  W->nrows = nrows;
  W->ncols = ncols;
  W->rowstride = M->rowstride;
  W->width = (ncols + m4ri_radix - 1) / m4ri_radix;
  W->high_bitmask = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);

  W->flags = mzd_flag_windowed_zerooffset;
  W->flags |= (ncols % m4ri_radix == 0) ? mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess;
  W->blockrows_log = M->blockrows_log;

  /* Blocks of M lying entirely above the window are skipped; see the
   * offset_vector diagram above. */
  wi_t const blockrows_mask = (1 << W->blockrows_log) - 1;
  int const skipped_blocks = (M->row_offset + lowr) >> W->blockrows_log;
  assert(skipped_blocks == 0 || ((M->flags & mzd_flag_multiple_blocks)));
  W->row_offset = (M->row_offset + lowr) & blockrows_mask;
  W->blocks = &M->blocks[skipped_blocks];
  wi_t const wrd_offset = lowc / m4ri_radix;
  W->offset_vector = (M->offset_vector + wrd_offset) + (W->row_offset - M->row_offset) * W->rowstride;
  if(nrows)
    W->rows = (word**)m4ri_mmc_calloc(nrows + 1, sizeof(word*));
  else
    W->rows = NULL;
  for(rci_t i = 0; i < nrows; ++i) {
    W->rows[i] = M->rows[lowr + i] + wrd_offset;
  }
  if (mzd_row_to_block(W, nrows - 1) > 0)
    W->flags |= M->flags & mzd_flag_multiple_blocks;

  /* offset_vector is the distance from the start of the first block to the first word of the first row. */
  assert(nrows == 0 || W->blocks[0].begin + W->offset_vector == W->rows[0]);

  __M4RI_DD_MZD(W);
  return W;
}
270
/*
 * Release a matrix created by mzd_init or mzd_init_window: free its row
 * table, its data blocks (only if it owns them, i.e. it is not a window),
 * and finally return the container to the mzd_t cache.
 */
void mzd_free(mzd_t *A) {
  if (A->rows != NULL) {
    m4ri_mmc_free(A->rows, (A->nrows + 1) * sizeof(word*));
  }
  if (mzd_owns_blocks(A)) {
    /* The block table is terminated by an entry with size 0. */
    int n = 0;
    while (A->blocks[n].size) {
      m4ri_mmc_free(A->blocks[n].begin, A->blocks[n].size);
      ++n;
    }
    m4ri_mmc_free(A->blocks, (n + 1) * sizeof(mzd_block_t));
  }
  mzd_t_free(A);
}
283
/* Add (XOR) row 'sourcerow' onto row 'destrow' across the full width of M. */
void mzd_row_add(mzd_t *M, rci_t sourcerow, rci_t destrow) {
  /* Delegate to the offset variant, starting at column 0. */
  mzd_row_add_offset(M, destrow, sourcerow, 0);
}
287
/* Zero out row 'row' of M from column 'coloffset' (inclusive) to the end. */
void mzd_row_clear_offset(mzd_t *M, rci_t row, rci_t coloffset) {
  wi_t const first_word = coloffset / m4ri_radix;
  word keep;

  if (coloffset % m4ri_radix) {
    /* Partial first word: preserve the bits before coloffset.
     * NOTE(review): the mask argument is m4ri_radix - coloffset, not
     * m4ri_radix - coloffset % m4ri_radix — confirm callers only pass
     * coloffset < m4ri_radix when it is not word-aligned. */
    keep = M->rows[row][first_word] & __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset);
  } else {
    keep = 0;
  }
  M->rows[row][first_word] = keep;

  for (wi_t i = first_word + 1; i < M->width; ++i)
    M->rows[row][i] = 0;

  __M4RI_DD_ROW(M, row);
}
306
307
/*
 * Gaussian elimination starting at column (and row) 'startcol'.
 * With full != 0 the matrix is fully reduced (entries above each pivot
 * cleared as well). Returns the number of pivots found.
 */
rci_t mzd_gauss_delayed(mzd_t *M, rci_t startcol, int full) {
  rci_t pivot_row = startcol;
  rci_t pivots = 0;

  for (rci_t col = startcol; col < M->ncols; ++col) {
    /* Find a pivot for this column at or below pivot_row. */
    for (rci_t row = pivot_row; row < M->nrows; ++row) {
      if (!mzd_read_bit(M, row, col))
        continue;
      mzd_row_swap(M, pivot_row, row);
      ++pivots;

      /* Clear the pivot column in the other rows; 'full' also clears
       * above the pivot, producing reduced echelon form. */
      rci_t const first = full ? 0 : pivot_row + 1;
      for (rci_t elim = first; elim < M->nrows; ++elim) {
        if (elim == pivot_row)
          continue;
        if (mzd_read_bit(M, elim, col))
          mzd_row_add_offset(M, elim, pivot_row, col);
      }
      ++pivot_row;
      break;
    }
  }

  __M4RI_DD_MZD(M);
  __M4RI_DD_RCI(pivots);
  return pivots;
}
334
mzd_echelonize_naive(mzd_t * M,int full)335 rci_t mzd_echelonize_naive(mzd_t *M, int full) {
336 return mzd_gauss_delayed(M, 0, full);
337 }
338
/**
 * Transpose a 64 x 64 matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of matrix dst.
 * \param rowstride_src Rowstride of matrix src.
 *
 * Rows of both matrices are expected to fit exactly in a word (offset == 0)
 * and lay entirely inside a single block.
 *
 * \note This function also works when dst == src.
 */

static inline void _mzd_copy_transpose_64x64(word* dst, word const* src, wi_t rowstride_dst, wi_t rowstride_src)
{
  /*
   * m runs over the values:
   *   0x00000000FFFFFFFF
   *   0x0000FFFF0000FFFF
   *   0x00FF00FF00FF00FF
   *   0x0F0F0F0F0F0F0F0F
   *   0x3333333333333333
   *   0x5555555555555555,
   * alternating j zeroes with j ones.
   *
   * Assume we have a matrix existing of four jxj matrices ((0,0) is in the top-right corner,
   * this is the memory-model view, see the layout on http://m4ri.sagemath.org/doxygen/structmzd__t.html):
   * ...[A1][B1][A0][B0]
   * ...[C1][D1][C0][D0]
   *          . [A2][B2]
   *        .   [C2][D2]
   *      .        .
   *    .
   * The following calculates the XOR between A and D,
   * and subsequently applies that to A and D respectively,
   * swapping A and D as a result.
   * Therefore wk starts at the first row and then has rowstride
   * added j times, running over the rows of A, then skips C
   * by adding j * rowstride to continue with the next A below C.
   */

  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst + j_rowstride_dst;
  // We start with j = 32, and a one-time unrolled loop, where
  // we copy from src and write the result to dst, swapping
  // the two 32x32 corner matrices.
  int j = 32;
  j_rowstride_dst >>= 1;
  word* RESTRICT wk = dst;
  for (word const* RESTRICT wks = src; wk < end; wk += j_rowstride_dst, wks += j_rowstride_src) {
    for (int k = 0; k < j; ++k, wk += rowstride_dst, wks += rowstride_src) {
      word xor = ((*wks >> j) ^ *(wks + j_rowstride_src)) & m;
      *wk = *wks ^ (xor << j);
      *(wk + j_rowstride_dst) = *(wks + j_rowstride_src) ^ xor;
    }
  }
  // Next we work in-place in dst and swap the corners of
  // each of the last matrices, all in parallel, for all
  // remaining values of j.
  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {
    j_rowstride_dst >>= 1;
    for (wk = dst; wk < end; wk += j_rowstride_dst) {
      for (int k = 0; k < j; ++k, wk += rowstride_dst) {
        word xor = ((*wk >> j) ^ *(wk + j_rowstride_dst)) & m;
        *wk ^= xor << j;
        *(wk + j_rowstride_dst) ^= xor;
      }
    }
  }
}
413
/**
 * Transpose two 64 x 64 matrices with width 1, in parallel.
 *
 * \param dst1 First word of destination matrix 1.
 * \param dst2 First word of destination matrix 2.
 * \param src1 First word of source matrix 1.
 * \param src2 First word of source matrix 2.
 * \param rowstride_dst Rowstride of destination matrices.
 * \param rowstride_src Rowstride of source matrices.
 *
 * Rows of all matrices are expected to fit exactly in a word (offset == 0)
 * and lay entirely inside a single block.
 *
 * Same butterfly algorithm as _mzd_copy_transpose_64x64, but the two
 * matrices are interleaved so the loads/stores of one hide the latency
 * of the other.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_64x64_2(word* RESTRICT dst1, word* RESTRICT dst2, word const* RESTRICT src1, word const* RESTRICT src2, wi_t rowstride_dst, wi_t rowstride_src)
{
  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst1 + j_rowstride_dst;
  int j = 32;
  word* RESTRICT wk[2];
  word const* RESTRICT wks[2];
  word xor[2];

  j_rowstride_dst >>= 1;
  wk[0] = dst1;
  wk[1] = dst2;
  wks[0] = src1;
  wks[1] = src2;

  /* First pass (j == 32): copy src -> dst while swapping the 32x32
   * corner matrices of both inputs. */
  do {

    for (int k = 0; k < j; ++k) {
      xor[0] = ((*wks[0] >> j) ^ *(wks[0] + j_rowstride_src)) & m;
      xor[1] = ((*wks[1] >> j) ^ *(wks[1] + j_rowstride_src)) & m;
      *wk[0] = *wks[0] ^ (xor[0] << j);
      *wk[1] = *wks[1] ^ (xor[1] << j);
      *(wk[0] + j_rowstride_dst) = *(wks[0] + j_rowstride_src) ^ xor[0];
      *(wk[1] + j_rowstride_dst) = *(wks[1] + j_rowstride_src) ^ xor[1];
      wk[0] += rowstride_dst;
      wk[1] += rowstride_dst;
      wks[0] += rowstride_src;
      wks[1] += rowstride_src;
    }

    wk[0] += j_rowstride_dst;
    wk[1] += j_rowstride_dst;
    wks[0] += j_rowstride_src;
    wks[1] += j_rowstride_src;

  } while(wk[0] < end);

  /* Remaining passes (j = 16, 8, 4, 2, 1) work in-place in dst. */
  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {

    j_rowstride_dst >>= 1;
    wk[0] = dst1;
    wk[1] = dst2;

    do {

      for (int k = 0; k < j; ++k) {
        xor[0] = ((*wk[0] >> j) ^ *(wk[0] + j_rowstride_dst)) & m;
        xor[1] = ((*wk[1] >> j) ^ *(wk[1] + j_rowstride_dst)) & m;
        *wk[0] ^= xor[0] << j;
        *wk[1] ^= xor[1] << j;
        *(wk[0] + j_rowstride_dst) ^= xor[0];
        *(wk[1] + j_rowstride_dst) ^= xor[1];
        wk[0] += rowstride_dst;
        wk[1] += rowstride_dst;
      }

      wk[0] += j_rowstride_dst;
      wk[1] += j_rowstride_dst;

    } while(wk[0] < end);
  }
}
495
/* Lookup table: log2_ceil_table[n - 1] == ceil(log2(n)) for 1 <= n <= 64. */
static unsigned char log2_ceil_table[64] = {
  0, 1, 2, 2, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6
};

/* Return ceil(log2(n)) for 1 <= n <= 64 by table lookup. */
static inline int log2_ceil(int n) {
  return (int)log2_ceil_table[n - 1];
}
511
/* transpose_mask[i] has alternating runs of 2^i zero bits and 2^i one bits,
 * selecting which sub-matrix halves are swapped at step i of the butterfly
 * transposition (see _mzd_transpose_Nxjx64 below). */
static word const transpose_mask[6] = {
  0x5555555555555555ULL,
  0x3333333333333333ULL,
  0x0F0F0F0F0F0F0F0FULL,
  0x00FF00FF00FF00FFULL,
  0x0000FFFF0000FFFFULL,
  0x00000000FFFFFFFFULL,
};
520
/**
 * Transpose 64/j matrices of size jxj in parallel.
 *
 * Where j equals n rounded up to the nearest power of 2.
 * The input array t must be of size j (containing the rows i of all matrices in t[i]).
 *
 * t[0..{j-1}] = [Al]...[A1][A0]
 *
 * \param t An array of j words.
 * \param n The number of rows in each matrix.
 *
 * \return log2(j)
 */

static inline int _mzd_transpose_Nxjx64(word* RESTRICT t, int n)
{
  int j = 1;
  int mi = 0; // Index into the transpose_mask array.

  while (j < n) // Don't swap with entirely undefined data (where [D] exists entirely of non-existant rows).
  {
    // Swap 64/j matrices of size jxj in 2j rows. Thus,
    // <---- one word --->
    // [Al][Bl]...[A0][B0]
    // [Cl][Dl]...[C0][D0], where l = 64/j - 1 and each matrix [A], [B] etc is jxj.
    // Then swap [A] and [D] in-place.

    // m runs over the values in transpose_mask, so that at all
    // times m exists of j zeroes followed by j ones, repeated.
    word const m = transpose_mask[mi];
    int k = 0; // Index into t[].
    do {
      // Run over all rows of [A] and [D].
      for (int i = 0; i < j; ++i, ++k) {
        // t[k] contains row i of all [A], and t[k + j] contains row i of all [D]. Swap them.
        word xor = ((t[k] >> j) ^ t[k + j]) & m;
        t[k] ^= xor << j;
        t[k + j] ^= xor;
      }
      k += j; // Skip [C].
    } while (k < n); // Stop if we passed all valid input.

    // Double the size of j and repeat this for the next 2j rows until all
    // n rows have been swapped (possibly with non-existant rows).
    j <<= 1;
    ++mi;
  }

  // mi == log2 of the final block size j.
  return mi;
}
571
572 /**
573 * Transpose a n x 64 matrix with width 1.
574 *
575 * \param dst First word of destination matrix.
576 * \param src First word of source matrix.
577 * \param rowstride_dst Rowstride of destination matrix.
578 * \param rowstride_src Rowstride of source matrix.
579 * \param n Number of rows in source matrix, must be less than 64.
580 *
581 * Rows of all matrices are expected have offset zero
582 * and lay entirely inside a single block.
583 *
584 * \note This function also works to transpose in-place.
585 */
586
_mzd_copy_transpose_lt64x64(word * RESTRICT dst,word const * RESTRICT src,wi_t rowstride_dst,wi_t rowstride_src,int n)587 static inline void _mzd_copy_transpose_lt64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
588 {
589 // Preload the n input rows into level 1, using a minimum of cache lines (compact storage).
590 word t[64];
591 word const* RESTRICT wks = src;
592 int k;
593 for (k = 0; k < n; ++k) {
594 t[k] = *wks;
595 wks += rowstride_src;
596 }
597 // see https://bitbucket.org/malb/m4ri/issues/53
598 for (; k<64; ++k) {
599 t[k] = 0;
600 }
601 if (n > 32) {
602 while (k < 64)
603 t[k++] = 0;
604 _mzd_copy_transpose_64x64(dst, t, rowstride_dst, 1);
605 return;
606 }
607 int log2j = _mzd_transpose_Nxjx64(t, n);
608 // All output bits are now transposed, but still might need to be shifted in place.
609 // What we have now is 64/j matrices of size jxj. Thus,
610 // [Al]...[A1][A0], where l = 64/j - 1.
611 // while the actual output is:
612 // [A0]
613 // [A1]
614 // ...
615 // [Al]
616 word const m = __M4RI_LEFT_BITMASK(n);
617 word* RESTRICT wk = dst;
618 switch (log2j) {
619 case 5:
620 {
621 wi_t const j_rowstride_dst = 32 * rowstride_dst;
622 for (int k = 0; k < 32; ++k) {
623 wk[0] = t[k] & m;
624 wk[j_rowstride_dst] = (t[k] >> 32) & m;
625 wk += rowstride_dst;
626 }
627 break;
628 }
629 case 4:
630 {
631 wi_t const j_rowstride_dst = 16 * rowstride_dst;
632 for (int k = 0; k < 16; ++k) {
633 wk[0] = t[k] & m;
634 wk[j_rowstride_dst] = (t[k] >> 16) & m;
635 wk[2 * j_rowstride_dst] = (t[k] >> 32) & m;
636 wk[3 * j_rowstride_dst] = (t[k] >> 48) & m;
637 wk += rowstride_dst;
638 }
639 break;
640 }
641 case 3:
642 {
643 wi_t const j_rowstride_dst = 8 * rowstride_dst;
644 for (int k = 0; k < 8; ++k) {
645 wk[0] = t[k] & m;
646 wk[j_rowstride_dst] = (t[k] >> 8) & m;
647 wk[2 * j_rowstride_dst] = (t[k] >> 16) & m;
648 wk[3 * j_rowstride_dst] = (t[k] >> 24) & m;
649 wk[4 * j_rowstride_dst] = (t[k] >> 32) & m;
650 wk[5 * j_rowstride_dst] = (t[k] >> 40) & m;
651 wk[6 * j_rowstride_dst] = (t[k] >> 48) & m;
652 wk[7 * j_rowstride_dst] = (t[k] >> 56) & m;
653 wk += rowstride_dst;
654 }
655 break;
656 }
657 case 2:
658 {
659 wi_t const j_rowstride_dst = 4 * rowstride_dst;
660 for (int k = 0; k < 4; ++k) {
661 word* RESTRICT wk2 = wk;
662 word tk = t[k];
663 for (int i = 0; i < 2; ++i) {
664 wk2[0] = tk & m;
665 wk2[j_rowstride_dst] = (tk >> 4) & m;
666 wk2[2 * j_rowstride_dst] = (tk >> 8) & m;
667 wk2[3 * j_rowstride_dst] = (tk >> 12) & m;
668 wk2[4 * j_rowstride_dst] = (tk >> 16) & m;
669 wk2[5 * j_rowstride_dst] = (tk >> 20) & m;
670 wk2[6 * j_rowstride_dst] = (tk >> 24) & m;
671 wk2[7 * j_rowstride_dst] = (tk >> 28) & m;
672 wk2 += 8 * j_rowstride_dst;
673 tk >>= 32;
674 }
675 wk += rowstride_dst;
676 }
677 break;
678 }
679 case 1:
680 {
681 wi_t const j_rowstride_dst = 2 * rowstride_dst;
682 for (int k = 0; k < 2; ++k) {
683 word* RESTRICT wk2 = wk;
684 word tk = t[k];
685 for (int i = 0; i < 8; ++i) {
686 wk2[0] = tk & m;
687 wk2[j_rowstride_dst] = (tk >> 2) & m;
688 wk2[2 * j_rowstride_dst] = (tk >> 4) & m;
689 wk2[3 * j_rowstride_dst] = (tk >> 6) & m;
690 wk2 += 4 * j_rowstride_dst;
691 tk >>= 8;
692 }
693 wk += rowstride_dst;
694 }
695 break;
696 }
697 case 0:
698 {
699 word* RESTRICT wk2 = wk;
700 word tk = t[0];
701 for (int i = 0; i < 16; ++i) {
702 wk2[0] = tk & m;
703 wk2[rowstride_dst] = (tk >> 1) & m;
704 wk2[2 * rowstride_dst] = (tk >> 2) & m;
705 wk2[3 * rowstride_dst] = (tk >> 3) & m;
706 wk2 += 4 * rowstride_dst;
707 tk >>= 4;
708 }
709 break;
710 }
711 }
712 }
713
/**
 * Transpose a 64 x n matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of columns in source matrix, must be less than 64.
 *
 * Rows of all matrices are expected have offset zero
 * and lay entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_64xlt64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  word t[64];
  int log2j = log2_ceil(n);
  word const* RESTRICT wks = src;
  // Pack the 64 input rows into j = 2^log2j words (t[]), interleaving
  // 64/j rows per word, then let _mzd_transpose_Nxjx64 do the swap work.
  switch (log2j) {
    case 6:
    {
      // n > 32: plain 64x64 transpose into the buffer, copy n rows out.
      _mzd_copy_transpose_64x64(t, src, 1, rowstride_src);
      word* RESTRICT wk = dst;
      for (int k = 0; k < n; ++k) {
        *wk = t[k];
        wk += rowstride_dst;
      }
      return;
    }
    case 5:
    {
      wi_t const j_rowstride_src = 32 * rowstride_src;
      for (int k = 0; k < 32; ++k) {
        t[k] = wks[0] | (wks[j_rowstride_src] << 32);
        wks += rowstride_src;
      }
      break;
    }
    case 4:
    {
      wi_t const j_rowstride_src = 16 * rowstride_src;
      for (int k = 0; k < 16; ++k) {
        t[k] = wks[0] | (wks[j_rowstride_src] << 16);
        t[k] |= (wks[2 * j_rowstride_src] << 32) | (wks[3 * j_rowstride_src] << 48);
        wks += rowstride_src;
      }
      break;
    }
    case 3:
    {
      wi_t const j_rowstride_src = 8 * rowstride_src;
      word tt;
      for (int k = 0; k < 8; ++k) {
        // Two independent accumulators (tt and t[k]) to break the
        // OR dependency chain; merged at the end.
        tt = wks[0] | (wks[j_rowstride_src] << 8);
        t[k] = (wks[2 * j_rowstride_src] << 16) | (wks[3 * j_rowstride_src] << 24);
        tt |= (wks[4 * j_rowstride_src] << 32) | (wks[5 * j_rowstride_src] << 40);
        t[k] |= (wks[6 * j_rowstride_src] << 48) | (wks[7 * j_rowstride_src] << 56);
        wks += rowstride_src;
        t[k] |= tt;
      }
      break;
    }
    case 2:
    {
      // Walk the rows backwards, shifting the accumulators up 4 bits
      // per group of 4 rows.
      word const* RESTRICT wks2 = wks + 60 * rowstride_src;
      t[0] = wks2[0];
      t[1] = wks2[rowstride_src];
      t[2] = wks2[2 * rowstride_src];
      t[3] = wks2[3 * rowstride_src];
      for (int i = 0; i < 15; ++i) {
        wks2 -= 4 * rowstride_src;
        t[0] <<= 4;
        t[1] <<= 4;
        t[2] <<= 4;
        t[3] <<= 4;
        t[0] |= wks2[0];
        t[1] |= wks2[rowstride_src];
        t[2] |= wks2[2 * rowstride_src];
        t[3] |= wks2[3 * rowstride_src];
      }
      break;
    }
    case 1:
    {
      wks += 62 * rowstride_src;
      t[0] = wks[0];
      t[1] = wks[rowstride_src];
      for (int i = 0; i < 31; ++i) {
        wks -= 2 * rowstride_src;
        t[0] <<= 2;
        t[1] <<= 2;
        t[0] |= wks[0];
        t[1] |= wks[rowstride_src];
      }
      break;
    }
    case 0:
    {
      // n == 1: interleave the single bits of all 64 rows directly.
      word tt[2];
      tt[0] = wks[0];
      tt[1] = wks[rowstride_src];
      for (int i = 2; i < 64; i += 2) {
        wks += 2 * rowstride_src;
        tt[0] |= wks[0] << i;
        tt[1] |= wks[rowstride_src] << i;
      }
      *dst = tt[0] | (tt[1] << 1);
      return;
    }
  }
  int j = 1 << log2j;
  _mzd_transpose_Nxjx64(t, j);
  word* RESTRICT wk = dst;
  for (int k = 0; k < n; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
}
834
/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 8.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal 8.
 * \param m Number of columns in source matrix, must be less than or equal 8.
 * \param maxsize The larger of n and m (controls how many swap steps run).
 *
 * Rows of all matrices are expected to have offset zero
 * and lay entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le8xle8(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 7;
  word const* RESTRICT wks = src;
  // Pack the whole <=8x8 matrix into a single word, one row per byte.
  word w = *wks;
  int shift = 0;
  for (int i = 1; i < n; ++i) {
    wks += rowstride_src;
    shift += 8;
    w |= (*wks << shift);
  }
  // The mask picks the sub-diagonal bits that must move by 7 positions
  // per step to land on the transposed diagonal (8x8 bit-transpose trick).
  word mask = 0x80402010080402ULL;
  word w7 = w >> 7;
  shift = 7;
  --m;
  do {
    word xor = (w ^ w7) & mask;
    mask >>= 8;
    w ^= (xor << shift);
    shift += 7;
    w7 >>= 7;
    w ^= xor;
  } while(shift < end);
  // Unpack: byte r of w is row r of the transposed matrix; write the
  // rows back-to-front so the loop can count shift down by 8.
  word* RESTRICT wk = dst + m * rowstride_dst;
  for (int shift = 8 * m; shift > 0; shift -= 8) {
    *wk = (unsigned char)(w >> shift);
    wk -= rowstride_dst;
  }
  *wk = (unsigned char)w;
}
881
/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 16.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal 16.
 * \param m Number of columns in source matrix, must be less than or equal 16.
 * \param maxsize The larger of n and m (controls how many swap steps run).
 *
 * Rows of all matrices are expected to have offset zero
 * and lay entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le16xle16(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 3;
  word const* RESTRICT wks = src;
  // Pack the <=16 input rows into 4 words: row r goes into the 16-bit
  // lane (r / 4) of t[r % 4]. The break ladder zero-fills the words
  // that receive no rows.
  word t[4];
  int i = n;
  do {
    t[0] = wks[0];
    if (--i == 0) {
      t[1] = 0;
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[1] = wks[rowstride_src];
    if (--i == 0) {
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[2] = wks[2 * rowstride_src];
    if (--i == 0) {
      t[3] = 0;
      break;
    }
    t[3] = wks[3 * rowstride_src];
    if (--i == 0)
      break;
    wks += 4 * rowstride_src;
    for(int shift = 16;; shift += 16) {
      t[0] |= (*wks << shift);
      if (--i == 0)
        break;
      t[1] |= (wks[rowstride_src] << shift);
      if (--i == 0)
        break;
      t[2] |= (wks[2 * rowstride_src] << shift);
      if (--i == 0)
        break;
      t[3] |= (wks[3 * rowstride_src] << shift);
      if (--i == 0)
        break;
      wks += 4 * rowstride_src;
    }
  } while(0);
  // 16x16 bit-transpose of the packed lanes: move sub-diagonal 4-bit
  // groups by 12 positions per step (same trick as the 8x8 variant).
  word mask = 0xF0000F0000F0ULL;
  int shift = 12;
  word xor[4];
  do {
    xor[0] = (t[0] ^ (t[0] >> shift)) & mask;
    xor[1] = (t[1] ^ (t[1] >> shift)) & mask;
    xor[2] = (t[2] ^ (t[2] >> shift)) & mask;
    xor[3] = (t[3] ^ (t[3] >> shift)) & mask;
    mask >>= 16;
    t[0] ^= (xor[0] << shift);
    t[1] ^= (xor[1] << shift);
    t[2] ^= (xor[2] << shift);
    t[3] ^= (xor[3] << shift);
    shift += 12;
    t[0] ^= xor[0];
    t[1] ^= xor[1];
    t[2] ^= xor[2];
    t[3] ^= xor[3];
  } while(shift < end);
  _mzd_transpose_Nxjx64(t, 4);
  // Unpack the m output rows, mirroring the packing loop above.
  i = m;
  word* RESTRICT wk = dst;
  do {
    wk[0] = (uint16_t)t[0];
    if (--i == 0)
      break;
    wk[rowstride_dst] = (uint16_t)t[1];
    if (--i == 0)
      break;
    wk[2 * rowstride_dst] = (uint16_t)t[2];
    if (--i == 0)
      break;
    wk[3 * rowstride_dst] = (uint16_t)t[3];
    if (--i == 0)
      break;
    wk += 4 * rowstride_dst;
    for(int shift = 16;; shift += 16) {
      wk[0] = (uint16_t)(t[0] >> shift);
      if (--i == 0)
        break;
      wk[rowstride_dst] = (uint16_t)(t[1] >> shift);
      if (--i == 0)
        break;
      wk[2 * rowstride_dst] = (uint16_t)(t[2] >> shift);
      if (--i == 0)
        break;
      wk[3 * rowstride_dst] = (uint16_t)(t[3] >> shift);
      if (--i == 0)
        break;
      wk += 4 * rowstride_dst;
    }
  } while(0);
}
996
/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 32.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal 32.
 * \param m Number of columns in source matrix, must be less than or equal 32.
 *
 * Rows of all matrices are expected to have offset zero
 * and lay entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le32xle32(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word const* RESTRICT wks = src;
  // Pack the <=32 rows into 16 words: rows 0..15 in the low halves,
  // rows 16..31 in the high halves; missing words are zero-filled.
  word t[16];
  int i = n;
  if (n > 16) {
    i -= 16;
    for (int j = 0; j < 16; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    int j = 0;
    do {
      t[j++] |= (*wks << 32);
      wks += rowstride_src;
    } while(--i);
  } else {
    int j;
    for (j = 0; j < n; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    for (; j < 16; ++j)
      t[j] = 0;
  }
  _mzd_transpose_Nxjx64(t, 16);
  // Unpack: output row r lives in the 16-bit lanes of t[r % 16];
  // the >>16 / >>32 shifts select which lane pair to merge.
  int one_more = (m & 1);
  word* RESTRICT wk = dst;
  if (m > 16) {
    m -= 16;
    for (int j = 0; j < 16; j += 2) {
      *wk = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j + 1] & 0xFFFF) | ((t[j + 1] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    for (int j = 1; j < m; j += 2) {
      *wk = ((t[j - 1] >> 16) & 0xFFFF) | ((t[j - 1] >> 32) & 0xFFFF0000);
      wk[rowstride_dst] = ((t[j] >> 16) & 0xFFFF) | ((t[j] >> 32) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = ((t[m - 1] >> 16) & 0xFFFF) | ((t[m - 1] >> 32) & 0xFFFF0000);
    }
  } else {
    for (int j = 1; j < m; j += 2) {
      *wk = (t[j - 1] & 0xFFFF) | ((t[j - 1] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = (t[m - 1] & 0xFFFF) | ((t[m - 1] >> 16) & 0xFFFF0000);
    }
  }
}
1067
/*
 * Transpose an n x m matrix of width 1; n and m must both be at most 64.
 * Gathers the n source rows into a local 64-word buffer (padding with
 * zeros), bit-transposes the buffer in place via _mzd_copy_transpose_64x64,
 * and scatters the first m resulting words into the destination.
 * Rows are expected to have offset zero and lie inside a single block.
 */
static inline void _mzd_copy_transpose_le64xle64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word buffer[64];

  /* Gather the n source rows; zero the remaining buffer entries. */
  word const* RESTRICT rp = src;
  for (int row = 0; row < 64; ++row) {
    if (row < n) {
      buffer[row] = *rp;
      rp += rowstride_src;
    } else {
      buffer[row] = 0;
    }
  }

  /* In-place 64x64 bit transpose of the buffer. */
  _mzd_copy_transpose_64x64(buffer, buffer, 1, 1);

  /* Scatter the first m transposed words into the destination rows. */
  word* RESTRICT wp = dst;
  for (int row = 0; row < m; ++row) {
    *wp = buffer[row];
    wp += rowstride_dst;
  }
}
1087
1088 void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp);
1089
/*
 * Transpose non-windowed A into non-windowed DST.
 *
 * Strategy: when multiple memory blocks are involved, let
 * _mzd_transpose_multiblock handle everything except the bottom-right
 * region; then peel off whole 64x64 tiles; then transpose the remaining
 * strip of fewer than 64 rows; finally hand the (<64)x(<64) corner to a
 * size-specialized helper.
 */
mzd_t *_mzd_transpose(mzd_t *DST, mzd_t const *A) {
  assert(!mzd_is_windowed(DST) && !mzd_is_windowed(A));
  // We assume that there fit at least 64 rows in a block, if
  // that is the case then each block will contain a multiple
  // of 64 rows, since blockrows is a power of 2.
  assert(A->blockrows_log >= 6 && DST->blockrows_log >= 6);

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;
  rci_t maxsize = MAX(nrows, ncols);

  word* RESTRICT fwd = mzd_first_row(DST);
  word const* RESTRICT fws = mzd_first_row(A);

  if (maxsize >= 64) {

    // This is the most non-intrusive way to deal with the case of multiple blocks.
    // Note that this code is VERY sensitive. ANY change to _mzd_transpose can easily
    // reduce the speed for small matrices (up to 64x64) by 5 to 10%.
    int const multiple_blocks = (A->flags | DST->flags) & mzd_flag_multiple_blocks;
    if (__M4RI_UNLIKELY(multiple_blocks)) {
      word* RESTRICT non_register_fwd;
      word const* RESTRICT non_register_fws;
      rci_t non_register_nrows;
      rci_t non_register_ncols;
      // After this call only the bottom-right corner remains; fwd/fws and
      // nrows/ncols are updated to describe that remaining region.
      _mzd_transpose_multiblock(DST, A, &non_register_fwd, &non_register_fws, &non_register_nrows, &non_register_ncols);
      fwd = non_register_fwd;
      fws = non_register_fws;
      nrows = non_register_nrows;
      ncols = non_register_ncols;
    }

    if (nrows >= 64) {
      /*
       * This is an interesting #if ...
       * I recommend to investigate the number of instructions, and the clocks per instruction,
       * as function of various sizes of the matrix (most likely especially the number of columns
       * (the size of a row) will have influence; also always use multiples of 64 or even 128),
       * for both cases below.
       *
       * To measure this run for example:
       *
       * ./bench_mzd -m 10 -x 10 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 32000 32000
       * ./bench_mzd -m 10 -x 100 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 128 10240
       * etc (increase -x for smaller sizes to get better accuracy).
       *
       * --Carlo Wood
       */
#if 1
      int js = ncols & nrows & 64; // True if the total number of whole 64x64 matrices is odd.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      word* RESTRICT fwd_current = fwd;
      word const* RESTRICT fws_current = fws;
      if (js) {
        js = 1;
        // Odd count: handle one 64x64 tile up front so the pairs below work out.
        _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
        if ((nrows | ncols) == 64) {
          // Exactly one 64x64 matrix: done.
          __M4RI_DD_MZD(DST);
          return DST;
        }
        fwd_current += rowstride_64_dst;
        ++fws_current;
      }
      rci_t const whole_64cols = ncols / 64;
      // The use of delayed and even, is to avoid calling _mzd_copy_transpose_64x64_2 twice.
      // This way it can be inlined without duplicating the amount of code that has to be loaded.
      word* RESTRICT fwd_delayed = NULL;
      word const* RESTRICT fws_delayed = NULL;
      int even = 0;
      while (1)
      {
        // Transpose whole 64x64 tiles of this 64-row strip, two at a time.
        for (int j = js; j < whole_64cols; ++j) {
          if (!even) {
            fwd_delayed = fwd_current;
            fws_delayed = fws_current;
          } else {
            _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
          }
          fwd_current += rowstride_64_dst;
          ++fws_current;
          even = !even;
        }
        nrows -= 64;
        if (ncols % 64) {
          // Tail of this strip: 64 rows but fewer than 64 columns.
          _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
        }
        fwd += 1;
        fws += 64 * A->rowstride;
        if (nrows < 64)
          break;
        js = 0;
        fws_current = fws;
        fwd_current = fwd;
      }
#else
      // The same as the above, but without using _mzd_copy_transpose_64x64_2.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      rci_t const whole_64cols = ncols / 64;
      assert(nrows >= 64);
      do {
        for (int j = 0; j < whole_64cols; ++j) {
          _mzd_copy_transpose_64x64(fwd + j * rowstride_64_dst, fws + j, DST->rowstride, A->rowstride);
        }
        nrows -= 64;
        if (ncols % 64) {
          _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
        }
        fwd += 1;
        fws += 64 * A->rowstride;
      } while(nrows >= 64);
#endif
    }

    if (nrows == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    // Transpose the remaining top rows. Now 0 < nrows < 64.

    while (ncols >= 64)
    {
      _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrows);
      ncols -= 64;
      fwd += 64 * DST->rowstride;
      fws += 1;
    }

    if (ncols == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    maxsize = MAX(nrows, ncols);
  }

  // Transpose the remaining corner. Now both 0 < nrows < 64 and 0 < ncols < 64.

  if (maxsize <= 8) {
    _mzd_copy_transpose_le8xle8(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 16) {
    _mzd_copy_transpose_le16xle16(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 32) {
    _mzd_copy_transpose_le32xle32(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }
  else {
    _mzd_copy_transpose_le64xle64(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }

  __M4RI_DD_MZD(DST);
  return DST;
}
1244
/*
 * Transpose every memory-block-sized sub-rectangle of A into DST except
 * the bottom-right corner, which is left for the caller (_mzd_transpose).
 * On return, *fwdp / *fwsp / *nrowsp / *ncolsp describe that remaining
 * corner region.
 */
void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp) {

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;

  rci_t blockrows_dst = 1 << DST->blockrows_log;	// The maximum number of rows in a block of DST.
  rci_t blockrows_src = 1 << A->blockrows_log;		// The maximum number of rows in a block of A.

  /* We're dividing the source matrix into blocks of multiples of 64x64, such that each
   * block fits entirely inside a single memory allocation block, both in the source
   * as well as the corresponding destination.
   *
   *                      <-------------------ncols----------------->
   *                        <---------blockrows_dst------->
   * .---------------------------------------------------------------.
   * |P      ^  Matrix A:|  .                             |Q      ^   |<-^---- A->blocks[0].begin
   * |       |           |  .                             |       |   |  |
   * |       |           |  .                             |       |   |  |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|  |
   * |       |           |  .        |                    |       |   |  |
   * |       |           |  .        |^                   |       |   |  |
   * |       |           |  .        |<64x64>             |       |   |  |
   * |       |           |  .        |v                   |       |   |  |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|  |- blockrows_src
   * |       |           |  .                             |       |   |  |
   * |       |           |  .                             |       |   |  |
   * |       |           |  .                             |       |   |  |
   * |       |nrows      |- - - - - -|- - - - - - - - - - - - - - - -|  |
   * |       |           |  .                             |       |   |  |
   * |       |           |  .                             |       |   |  |
   * |       |           |  .                             |       |   |  v
   * |===================+===========================================|
   * |R      |           |  .                             |S      |   |<------ A->blocks[1].begin
   * |       |           |  .                             |       |   |
   * |       |           |  .                             |       |   |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|
   * |       |           |  .                             |       |   |
   * |       |           |  .                             |       |   |
   * |       |           |  .                             |       |   |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|
   * |       v           |  .                             |       |   |
   * |                   `-------------------------------------------|
   * |                                                   |           |
   * |                                                   |           |
   * |                                                   |           |
   * |                                                   |           |
   * |                                                   |           |
   * `---------------------------------------------------------------'
   *
   * Imagine this also to be the memory map of DST, which then would be
   * mirrored in the diagonal line from the top/right to the bottom/left.
   * Then each of the squares P, Q, R and S lay entirely inside one
   * memory block in both the source as well as the destination matrix.
   * P and Q are really the same block for matrix A (as are R and S),
   * while P and R (and Q and S) are really the same block for DST.
   *
   * We're going to run over the top/right corners of each of these
   * memory "blocks" and then transpose it, one by one, running
   * from right to left and top to bottom. The last one (R) will be
   * done by the calling function, so we just return when we get there.
   */

  rci_t R_top = (nrows >> A->blockrows_log) << A->blockrows_log;
  rci_t R_right = (ncols >> DST->blockrows_log) << DST->blockrows_log;
  for (rci_t col = 0; col < ncols; col += blockrows_dst) {
    rci_t end = (col == R_right) ? R_top : nrows;
    for (rci_t row = 0; row < end; row += blockrows_src) {

      // Dimensions of the current sub-rectangle; the last row/column strip
      // may be smaller than a full block.
      rci_t nrowsb = (row < R_top) ? blockrows_src : (nrows - R_top);
      rci_t ncolsb = (col < R_right) ? blockrows_dst : (ncols - R_right);
      word const* RESTRICT fws = mzd_row(A, row) + col / m4ri_radix;
      word* RESTRICT fwd = mzd_row(DST, col) + row / m4ri_radix;

      // The following code is (almost) duplicated from _mzd_transpose.
      if (nrowsb >= 64) {

        int js = ncolsb & nrowsb & 64; // True if the total number of whole 64x64 matrices is odd.
        wi_t const rowstride_64_dst = 64 * DST->rowstride;
        word* RESTRICT fwd_current = fwd;
        word const* RESTRICT fws_current = fws;
        if (js) {
          js = 1;
          _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
          fwd_current += rowstride_64_dst;
          ++fws_current;
        }
        rci_t const whole_64cols = ncolsb / 64;
        // The use of delayed and even, is to avoid calling _mzd_copy_transpose_64x64_2 twice.
        // This way it can be inlined without duplicating the amount of code that has to be loaded.
        word* RESTRICT fwd_delayed = NULL;
        word const* RESTRICT fws_delayed = NULL;
        int even = 0;
        while (1)
        {
          for (int j = js; j < whole_64cols; ++j) {
            if (!even) {
              fwd_delayed = fwd_current;
              fws_delayed = fws_current;
            } else {
              _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
            }
            fwd_current += rowstride_64_dst;
            ++fws_current;
            even = !even;
          }
          nrowsb -= 64;
          if (ncolsb % 64) {
            _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncolsb % 64);
          }
          fwd += 1;
          fws += 64 * A->rowstride;
          if (nrowsb < 64)
            break;
          js = 0;
          fws_current = fws;
          fwd_current = fwd;
        }
      }

      if (nrowsb == 0)
        continue;

      // Transpose the remaining top rows. Now 0 < nrowsb < 64.

      while (ncolsb >= 64)
      {
        _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrowsb);
        ncolsb -= 64;
        fwd += 64 * DST->rowstride;
        fws += 1;
      }

      // This is true because if it wasn't then nrowsb has to be 0 and we continued before already.
      assert(ncolsb == 0);
    }
  }

  // Report the remaining bottom-right corner back to the caller.
  *nrowsp = nrows - R_top;
  *ncolsp = ncols - R_right;
  if (R_top < nrows)
    *fwsp = mzd_row(A, R_top) + R_right / m4ri_radix;
  if (R_right < ncols)
    *fwdp = mzd_row(DST, R_right) + R_top / m4ri_radix;
}
1388
/**
 * \brief Transpose A into DST (allocating DST when NULL).
 *
 * Windowed operands are handled by first copying them to temporary
 * non-windowed matrices; the real work happens in _mzd_transpose.
 */
mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A) {
  if (DST == NULL) {
    DST = mzd_init( A->ncols, A->nrows );
  } else if (__M4RI_UNLIKELY(DST->nrows != A->ncols || DST->ncols != A->nrows)) {
    m4ri_die("mzd_transpose: Wrong size for return matrix.\n");
  } else {
    /** it seems this is taken care of in the subroutines, re-enable if running into problems **/
    //mzd_set_ui(DST,0);
  }

  if(A->nrows == 0 || A->ncols == 0)
    return mzd_copy(DST, A);

  /* Fast path: neither operand is a window. */
  if (__M4RI_LIKELY(!mzd_is_windowed(DST) && !mzd_is_windowed(A)))
    return _mzd_transpose(DST, A);

  /* At least one operand is a window: work on non-windowed copies. */
  mzd_t const *source = A;
  int const source_is_copy = mzd_is_windowed(A);
  if (source_is_copy)
    source = mzd_copy(NULL, A);

  if (__M4RI_LIKELY(!mzd_is_windowed(DST))) {
    _mzd_transpose(DST, source);
  } else {
    mzd_t *tmp = mzd_init(DST->nrows, DST->ncols);
    _mzd_transpose(tmp, source);
    mzd_copy(DST, tmp);
    mzd_free(tmp);
  }

  if (source_is_copy)
    mzd_free((mzd_t*)source);
  return DST;
}
1419
/**
 * \brief C = A*B using the naive cubic algorithm (allocating C when NULL).
 */
mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (C == NULL) {
    C = mzd_init(A->nrows, B->ncols);
  } else if (C->nrows != A->nrows || C->ncols != B->ncols) {
    m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n");
  }

  if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */
    /* Narrow B: run the parity kernel against the transpose of B. */
    mzd_t *Bt = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, Bt, 1);
    mzd_free(Bt);
  } else {
    _mzd_mul_va(C, A, B, 1);
  }
  return C;
}
1437
/**
 * \brief C += A*B using the naive cubic algorithm. C must be
 * preallocated with matching dimensions.
 */
mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (C->nrows != A->nrows || C->ncols != B->ncols) {
    m4ri_die("mzd_addmul_naive: Provided return matrix has wrong dimensions.\n");
  }

  if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */
    /* Narrow B: run the parity kernel against the transpose of B. */
    mzd_t *Bt = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, Bt, 0);
    mzd_free(Bt);
  } else {
    _mzd_mul_va(C, A, B, 0);
  }
  return C;
}
1452
_mzd_mul_naive(mzd_t * C,mzd_t const * A,mzd_t const * B,const int clear)1453 mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, const int clear) {
1454 wi_t eol;
1455 word *a, *b, *c;
1456
1457 if (clear) {
1458 word const mask_end = C->high_bitmask;
1459 /* improves performance on x86_64 but is not cross plattform */
1460 /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
1461 for (rci_t i = 0; i < C->nrows; ++i) {
1462 wi_t j = 0;
1463 for (; j < C->width - 1; ++j) {
1464 C->rows[i][j] = 0;
1465 }
1466 C->rows[i][j] &= ~mask_end;
1467 }
1468 }
1469
1470 if(C->ncols % m4ri_radix) {
1471 eol = (C->width - 1);
1472 } else {
1473 eol = (C->width);
1474 }
1475
1476 word parity[64];
1477 for (int i = 0; i < 64; ++i) {
1478 parity[i] = 0;
1479 }
1480 wi_t const wide = A->width;
1481 int const blocksize = __M4RI_MUL_BLOCKSIZE;
1482 for (rci_t start = 0; start + blocksize <= C->nrows; start += blocksize) {
1483 for (rci_t i = start; i < start + blocksize; ++i) {
1484 a = A->rows[i];
1485 c = C->rows[i];
1486 for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
1487 for (int k = 0; k < m4ri_radix; ++k) {
1488 b = B->rows[j + k];
1489 parity[k] = a[0] & b[0];
1490 for (wi_t ii = wide - 1; ii >= 1; --ii)
1491 parity[k] ^= a[ii] & b[ii];
1492 }
1493 c[j / m4ri_radix] ^= m4ri_parity64(parity);
1494 }
1495
1496 if (eol != C->width) {
1497 word const mask_end = C->high_bitmask;
1498 /* improves performance on x86_64 but is not cross plattform */
1499 /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
1500 for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
1501 b = B->rows[m4ri_radix * eol + k];
1502 parity[k] = a[0] & b[0];
1503 for (wi_t ii = 1; ii < A->width; ++ii)
1504 parity[k] ^= a[ii] & b[ii];
1505 }
1506 c[eol] ^= m4ri_parity64(parity) & mask_end;
1507 }
1508 }
1509 }
1510
1511 for (rci_t i = C->nrows - (C->nrows % blocksize); i < C->nrows; ++i) {
1512 a = A->rows[i];
1513 c = C->rows[i];
1514 for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
1515 for (int k = 0; k < m4ri_radix; ++k) {
1516 b = B->rows[j+k];
1517 parity[k] = a[0] & b[0];
1518 for (wi_t ii = wide - 1; ii >= 1; --ii)
1519 parity[k] ^= a[ii] & b[ii];
1520 }
1521 c[j/m4ri_radix] ^= m4ri_parity64(parity);
1522 }
1523
1524 if (eol != C->width) {
1525 word const mask_end = C->high_bitmask;
1526 /* improves performance on x86_64 but is not cross plattform */
1527 /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
1528 for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
1529 b = B->rows[m4ri_radix * eol + k];
1530 parity[k] = a[0] & b[0];
1531 for (wi_t ii = 1; ii < A->width; ++ii)
1532 parity[k] ^= a[ii] & b[ii];
1533 }
1534 c[eol] ^= m4ri_parity64(parity) & mask_end;
1535 }
1536 }
1537
1538 __M4RI_DD_MZD(C);
1539 return C;
1540 }
1541
/**
 * \brief C (+)= v*A by row combination: for every set bit v[i,j],
 * row j of A is added into row i of C.
 */
mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear) {
  if(clear)
    mzd_set_ui(C, 0);

  for(rci_t row = 0; row < v->nrows; ++row) {
    for(rci_t col = 0; col < v->ncols; ++col) {
      if (mzd_read_bit(v, row, col))
        mzd_combine(C, row, 0, C, row, 0, A, col, 0);
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
1557
/**
 * \brief Fill A with random bits, leaving padding bits of the last
 * word of each row untouched.
 */
void mzd_randomize(mzd_t *A) {
  word const mask_end = A->high_bitmask;
  wi_t const last = A->width - 1;
  for(rci_t r = 0; r < A->nrows; ++r) {
    word *row = A->rows[r];
    for(wi_t w = 0; w < last; ++w)
      row[w] = m4ri_random_word();
    /* Merge random bits into the live portion of the final word only. */
    row[last] ^= (row[last] ^ m4ri_random_word()) & mask_end;
  }

  __M4RI_DD_MZD(A);
}
1569
/**
 * \brief Fill A with bits drawn from the user-supplied generator rc(data),
 * leaving padding bits of the last word of each row untouched.
 */
void mzd_randomize_custom(mzd_t *A, m4ri_random_callback rc, void* data) {
  word const mask_end = A->high_bitmask;
  wi_t const last = A->width - 1;
  for(rci_t r = 0; r < A->nrows; ++r) {
    word *row = A->rows[r];
    for(wi_t w = 0; w < last; ++w)
      row[w] = rc(data);
    /* Merge generated bits into the live portion of the final word only. */
    row[last] ^= (row[last] ^ rc(data)) & mask_end;
  }

  __M4RI_DD_MZD(A);
}
1581
/**
 * \brief Set A to value times the identity: clear all entries, then set
 * the main diagonal when value is odd (arithmetic is over GF(2)).
 */
void mzd_set_ui( mzd_t *A, unsigned int value) {
  word const mask_end = A->high_bitmask;
  wi_t const last = A->width - 1;

  /* Clear every entry; padding bits of the final word stay intact. */
  for (rci_t r = 0; r < A->nrows; ++r) {
    word *row = A->rows[r];
    for (wi_t w = 0; w < last; ++w)
      row[w] = 0;
    row[last] &= ~mask_end;
  }

  if (value % 2 != 0) {
    /* Odd value reduces to 1 mod 2: set the main diagonal. */
    rci_t const diag = MIN(A->nrows, A->ncols);
    for (rci_t r = 0; r < diag; ++r)
      mzd_write_bit(A, r, r, 1);
  }

  __M4RI_DD_MZD(A);
}
1604
/**
 * \brief Return TRUE if A and B have equal dimensions and equal entries;
 * padding bits of the last word of each row are ignored.
 */
int mzd_equal(mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows || A->ncols != B->ncols)
    return FALSE;
  if (A == B)
    return TRUE;

  wi_t const last = A->width - 1;
  word const mask_end = A->high_bitmask;

  for (rci_t r = 0; r < A->nrows; ++r) {
    /* Full words must match exactly ... */
    for (wi_t w = 0; w < last; ++w) {
      if (A->rows[r][w] != B->rows[r][w])
        return FALSE;
    }
    /* ... while only the live bits of the final word are compared. */
    if ((A->rows[r][last] ^ B->rows[r][last]) & mask_end)
      return FALSE;
  }
  return TRUE;
}
1626
/**
 * \brief Three-way comparison of A and B: smaller dimensions compare
 * lower; otherwise rows are compared from the top down, each row from
 * its highest word (masked to the live bits) to its lowest.
 */
int mzd_cmp(mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows)
    return (A->nrows < B->nrows) ? -1 : 1;
  if (A->ncols != B->ncols)
    return (A->ncols < B->ncols) ? -1 : 1;

  word const mask_end = A->high_bitmask;
  wi_t const last = A->width - 1;

  /* Columns with large index are "larger", but rows with small index
     are more important than with large index. */
  for (rci_t r = 0; r < A->nrows; ++r) {
    word const a_hi = A->rows[r][last] & mask_end;
    word const b_hi = B->rows[r][last] & mask_end;
    if (a_hi != b_hi)
      return (a_hi < b_hi) ? -1 : 1;

    for (wi_t w = last - 1; w >= 0; --w) {
      if (A->rows[r][w] != B->rows[r][w])
        return (A->rows[r][w] < B->rows[r][w]) ? -1 : 1;
    }
  }
  return 0;
}
1654
/**
 * \brief Copy P into N (allocating N when NULL). N may be larger than P;
 * bits of N outside P's columns in the last touched word are preserved.
 */
mzd_t *mzd_copy(mzd_t *N, mzd_t const *P) {
  if (N == P)
    return N;

  if (N == NULL) {
    N = mzd_init(P->nrows, P->ncols);
  } else if (N->nrows < P->nrows || N->ncols < P->ncols) {
    m4ri_die("mzd_copy: Target matrix is too small.");
  }

  wi_t const last = P->width - 1;
  word const mask_end = P->high_bitmask;
  for (rci_t r = 0; r < P->nrows; ++r) {
    word const *src = P->rows[r];
    word *dst = N->rows[r];
    for (wi_t w = 0; w < last; ++w)
      dst[w] = src[w];
    /* Merge only P's live bits into the final word of the row. */
    dst[last] = (dst[last] & ~mask_end) | (src[last] & mask_end);
  }

  __M4RI_DD_MZD(N);
  return N;
}
1678
1679 /* This is sometimes called augment */
/**
 * \brief Augment: C = [A | B] (allocating C when NULL). A and B must
 * have the same number of rows.
 */
mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows) {
    m4ri_die("mzd_concat: Bad arguments to concat!\n");
  }

  if (C == NULL) {
    C = mzd_init(A->nrows, A->ncols + B->ncols);
  } else if (C->nrows != A->nrows || C->ncols != (A->ncols + B->ncols)) {
    m4ri_die("mzd_concat: C has wrong dimension!\n");
  }

  /* A starts at word offset 0 in C: copy it word-wise. */
  for (rci_t r = 0; r < A->nrows; ++r) {
    word const *src = A->rows[r];
    word *dst = C->rows[r];
    for (wi_t w = 0; w < A->width; ++w)
      dst[w] = src[w];
  }

  /* B generally lands at an arbitrary bit offset: copy bit by bit. */
  for (rci_t r = 0; r < B->nrows; ++r) {
    for (rci_t c = 0; c < B->ncols; ++c)
      mzd_write_bit(C, r, c + A->ncols, mzd_read_bit(B, r, c));
  }

  __M4RI_DD_MZD(C);
  return C;
}
1708
/**
 * \brief Stack: C = [A on top of B] (allocating C when NULL). A and B
 * must have the same number of columns.
 */
mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->ncols != B->ncols) {
    m4ri_die("mzd_stack: A->ncols (%d) != B->ncols (%d)!\n", A->ncols, B->ncols);
  }

  if (C == NULL) {
    C = mzd_init(A->nrows + B->nrows, A->ncols);
  } else if (C->nrows != (A->nrows + B->nrows) || C->ncols != A->ncols) {
    m4ri_die("mzd_stack: C has wrong dimension!\n");
  }

  /* Rows of A fill the top of C ... */
  for (rci_t r = 0; r < A->nrows; ++r) {
    word const *src = A->rows[r];
    word *dst = C->rows[r];
    for (wi_t w = 0; w < A->width; ++w)
      dst[w] = src[w];
  }

  /* ... and rows of B follow directly below. */
  for (rci_t r = 0; r < B->nrows; ++r) {
    word const *src = B->rows[r];
    word *dst = C->rows[A->nrows + r];
    for (wi_t w = 0; w < B->width; ++w)
      dst[w] = src[w];
  }

  __M4RI_DD_MZD(C);
  return C;
}
1739
/**
 * \brief Invert A by Gaussian elimination on the augmented matrix [A | I].
 *
 * \param INV Preallocated result matrix, or NULL to allocate one.
 * \param A   Matrix to invert.
 * \param I   Identity matrix matching A's dimensions.
 *
 * \return the right half of the reduced [A | I], or NULL when the
 * echelonization reports rank 0.
 */
mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I) {
  mzd_t *H;

  /* Build [A | I] and row-reduce it. */
  H = mzd_concat(NULL, A, I);

  rci_t x = mzd_echelonize_naive(H, TRUE);

  /* NOTE(review): only rank 0 is rejected here; a singular matrix with
   * nonzero rank is not detected, so the extracted "inverse" would be
   * meaningless for such input — presumably callers ensure A is
   * invertible. TODO confirm. */
  if (x == 0) {
    mzd_free(H);
    return NULL;
  }

  /* The right half of the reduced H holds the inverse. */
  INV = mzd_submatrix(INV, H, 0, A->ncols, A->nrows, 2 * A->ncols);

  mzd_free(H);

  __M4RI_DD_MZD(INV);
  return INV;
}
1759
/**
 * \brief ret = left + right over GF(2) (allocating ret when NULL),
 * with dimension checks; the work happens in _mzd_add.
 */
mzd_t *mzd_add(mzd_t *ret, mzd_t const *left, mzd_t const *right) {
  if (left->nrows != right->nrows || left->ncols != right->ncols) {
    m4ri_die("mzd_add: rows and columns must match.\n");
  }
  if (ret == NULL) {
    ret = mzd_init(left->nrows, left->ncols);
  } else if (ret != left && (ret->nrows != left->nrows || ret->ncols != left->ncols)) {
    m4ri_die("mzd_add: rows and columns of returned matrix must match.\n");
  }
  return _mzd_add(ret, left, right);
}
1773
/**
 * \brief C = A + B over GF(2) (word-wise XOR), without dimension checks.
 *
 * The switch hand-unrolls rows of width 1..8 so no inner loop is needed
 * for the common small cases; wider rows fall through to
 * mzd_combine_even. In every case the last word of a row is merged
 * under mask_end so C's padding bits stay untouched.
 */
mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  rci_t const nrows = MIN(MIN(A->nrows, B->nrows), C->nrows);

  if (C == B) { //swap operands so the aliased one is A
    mzd_t const *tmp = A;
    A = B;
    B = tmp;
  }

  word const mask_end = C->high_bitmask;

  switch(A->width) {
  case 0:
    return C;
  case 1:
    for(rci_t i = 0; i < nrows; ++i) {
      /* XOR-merge trick: only bits under mask_end change in C. */
      C->rows[i][0] ^= ((A->rows[i][0] ^ B->rows[i][0] ^ C->rows[i][0]) & mask_end);
    }
    break;
  case 2:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] ^= ((A->rows[i][1] ^ B->rows[i][1] ^ C->rows[i][1]) & mask_end);
    }
    break;
  case 3:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] ^= ((A->rows[i][2] ^ B->rows[i][2] ^ C->rows[i][2]) & mask_end);
    }
    break;
  case 4:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] ^= ((A->rows[i][3] ^ B->rows[i][3] ^ C->rows[i][3]) & mask_end);
    }
    break;
  case 5:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] ^= ((A->rows[i][4] ^ B->rows[i][4] ^ C->rows[i][4]) & mask_end);
    }
    break;
  case 6:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] ^= ((A->rows[i][5] ^ B->rows[i][5] ^ C->rows[i][5]) & mask_end);
    }
    break;
  case 7:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] ^= ((A->rows[i][6] ^ B->rows[i][6] ^ C->rows[i][6]) & mask_end);
    }
    break;
  case 8:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] = A->rows[i][6] ^ B->rows[i][6];
      C->rows[i][7] ^= ((A->rows[i][7] ^ B->rows[i][7] ^ C->rows[i][7]) & mask_end);
    }
    break;

  default:
    /* Wider rows: use the generic row-combination routine. */
    for(rci_t i = 0; i < nrows; ++i) {
      mzd_combine_even(C,i,0, A,i,0, B,i,0);
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
1866
/**
 * \brief Copy the window [startrow,endrow) x [startcol,endcol) of M
 * into S (allocating S when NULL; S may also be larger).
 *
 * Word-aligned start columns get a memcpy fast path; otherwise each
 * word is assembled bitwise via mzd_read_bits.
 */
mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const startrow, rci_t const startcol, rci_t const endrow, rci_t const endcol) {
  rci_t const nrows = endrow - startrow;
  rci_t const ncols = endcol - startcol;

  if (S == NULL) {
    S = mzd_init(nrows, ncols);
  } else if( (S->nrows < nrows) | (S->ncols < ncols) ) {
    m4ri_die("mzd_submatrix: got S with dimension %d x %d but expected %d x %d\n", S->nrows, S->ncols, nrows, ncols);
  }

  if (startcol % m4ri_radix == 0) {

    wi_t const startword = startcol / m4ri_radix;
    /* we start at the beginning of a word */
    if(ncols / m4ri_radix != 0) {
      /* Copy the complete words of each row in one go. */
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        memcpy(S->rows[i], M->rows[x] + startword, sizeof(word) * (ncols / m4ri_radix));
      }
    }
    if (ncols % m4ri_radix) {
      word const mask_end = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        /* process remaining bits */
        word temp = M->rows[x][startword + ncols / m4ri_radix] & mask_end;
        S->rows[i][ncols / m4ri_radix] = temp;
      }
    }
  } else {
    /* Unaligned start column: gather each destination word bitwise. */
    wi_t j;
    for(rci_t i=0; i<nrows; i++) {
      for(j=0; j+m4ri_radix<ncols; j+=m4ri_radix)
        S->rows[i][j/m4ri_radix] = mzd_read_bits(M, startrow+i, startcol+j, m4ri_radix);
      /* Last (possibly partial) word: keep S's padding bits intact. */
      S->rows[i][j/m4ri_radix] &= ~S->high_bitmask;
      S->rows[i][j/m4ri_radix] |= mzd_read_bits(M, startrow+i, startcol+j, ncols - j) & S->high_bitmask;
    }
  }
  __M4RI_DD_MZD(S);
  return S;
}
1906
/**
 * \brief Swap columns cola and colb of M.
 *
 * Per row, the two bits are XORed together; if they differ, both are
 * flipped (a conditional bit swap with no branches). Two cases are
 * handled — both columns in the same word, or in different words — and
 * the matrix is walked memory-block by memory-block.
 */
void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb) {
  if (cola == colb)
    return;

  rci_t const _cola = cola;
  rci_t const _colb = colb;

  wi_t const a_word = _cola / m4ri_radix;
  wi_t const b_word = _colb / m4ri_radix;

  int const a_bit = _cola % m4ri_radix;
  int const b_bit = _colb % m4ri_radix;

  word* RESTRICT ptr = mzd_first_row(M);
  int max_bit = MAX(a_bit, b_bit);
  int count = mzd_rows_in_block(M, 0);
  assert(count > 0);
  int min_bit = a_bit + b_bit - max_bit;  /* i.e. MIN(a_bit, b_bit) */
  int block = 0;
  int offset = max_bit - min_bit;         /* shift distance between the two bits */
  word mask = m4ri_one << min_bit;

  if (a_word == b_word) {
    /* Both columns live in the same word of each row. */
    while(1) {
      ptr += a_word;
      int fast_count = count / 4;
      int rest_count = count - 4 * fast_count;
      word xor[4];
      wi_t const rowstride = M->rowstride;
      /* Unrolled: process four rows per iteration. xor holds, at
       * min_bit, the XOR of the two column bits; spreading it back to
       * both positions flips both bits exactly when they differ. */
      while (fast_count--) {
        xor[0] = ptr[0];
        xor[1] = ptr[rowstride];
        xor[2] = ptr[2 * rowstride];
        xor[3] = ptr[3 * rowstride];
        xor[0] ^= xor[0] >> offset;
        xor[1] ^= xor[1] >> offset;
        xor[2] ^= xor[2] >> offset;
        xor[3] ^= xor[3] >> offset;
        xor[0] &= mask;
        xor[1] &= mask;
        xor[2] &= mask;
        xor[3] &= mask;
        xor[0] |= xor[0] << offset;
        xor[1] |= xor[1] << offset;
        xor[2] |= xor[2] << offset;
        xor[3] |= xor[3] << offset;
        ptr[0] ^= xor[0];
        ptr[rowstride] ^= xor[1];
        ptr[2 * rowstride] ^= xor[2];
        ptr[3 * rowstride] ^= xor[3];
        ptr += 4 * rowstride;
      }
      /* Remaining rows of this block, one at a time. */
      while (rest_count--) {
        word xor = *ptr;
        xor ^= xor >> offset;
        xor &= mask;
        *ptr ^= xor | (xor << offset);
        ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
    }
  } else {
    /* Columns live in different words: min_ptr addresses the word
     * holding the lower bit, max_offset the distance to the other word. */
    word* RESTRICT min_ptr;
    wi_t max_offset;
    if (min_bit == a_bit) {
      min_ptr = ptr + a_word;
      max_offset = b_word - a_word;
    } else {
      min_ptr = ptr + b_word;
      max_offset = a_word - b_word;
    }
    while(1) {
      wi_t const rowstride = M->rowstride;
      while(count--) {
        /* Same conditional-swap trick, across two words. */
        word xor = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
        min_ptr[0] ^= xor;
        min_ptr[max_offset] ^= xor << offset;
        min_ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
      if (min_bit == a_bit)
        min_ptr = ptr + a_word;
      else
        min_ptr = ptr + b_word;
    }
  }

  __M4RI_DD_MZD(M);
}
2000
/**
 * \brief Return 1 if A is the zero matrix, 0 otherwise.
 *
 * Scans row by row, OR-ing all full words and the masked last word of each
 * row (bits beyond ncols are ignored), and stops at the first nonzero row.
 */
int mzd_is_zero(mzd_t const *A) {
  word const mask_end = A->high_bitmask;
  for (rci_t r = 0; r < A->nrows; ++r) {
    word const *row = A->rows[r];
    word acc = row[A->width - 1] & mask_end;
    for (wi_t w = 0; w < A->width - 1; ++w)
      acc |= row[w];
    if (acc)
      return 0;
  }
  return 1;
}
2013
/**
 * \brief Copy row j of A into row i of B.
 *
 * B must be at least as wide as A.  Bits of B's row beyond A->ncols are
 * preserved: full words are copied wholesale and the last (possibly
 * partial) word of A's row is merged under the column mask.
 *
 * \param B Destination matrix.
 * \param i Destination row index.
 * \param A Source matrix.
 * \param j Source row index.
 */
void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j) {
  assert(B->ncols >= A->ncols);
  wi_t const last = MIN(B->width, A->width) - 1;

  word const *src = A->rows[j];
  word *dst = B->rows[i];

  word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);

  if (last > 0)
    memcpy(dst, src, (size_t)last * sizeof(word));
  /* merge the final word: take A's bits inside the mask, keep B's outside */
  dst[last] = (dst[last] & ~mask_end) | (src[last] & mask_end);

  __M4RI_DD_ROW(B, i);
}
2034
2035
/**
 * \brief Find a pivot in A at or after (start_row, start_col).
 *
 * Searches one word-column at a time; within a word-column the row whose
 * word has the lowest set bit (leftmost nonzero column) wins, decided by
 * m4ri_lesser_LSB.  Three phases: the (possibly partial) word containing
 * start_col, all full interior words, and the partial last word.
 *
 * \param A         Matrix to search (unchanged).
 * \param start_row First row to consider.
 * \param start_col First column to consider.
 * \param r         Output: pivot row (written only on success).
 * \param c         Output: pivot column (written only on success).
 *
 * \return 1 if a nonzero entry was found (and *r, *c set), 0 otherwise.
 */
int mzd_find_pivot(mzd_t const *A, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c) {
  rci_t const nrows = A->nrows;
  rci_t const ncols = A->ncols;
  word data = 0;
  rci_t row_candidate = 0;
  if(A->ncols - start_col < m4ri_radix) {
    /* fewer than one word's worth of columns remain: the loop body runs once */
    for(rci_t j = start_col; j < A->ncols; j += m4ri_radix) {
      int const length = MIN(m4ri_radix, ncols - j);
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = mzd_read_bits(A, i, j, length);
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
        }
      }
      if(data) {
        *r = row_candidate;
        /* pivot column = position of the lowest set bit */
        for(int l = 0; l < length; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = j + l;
            break;
          }
        }
        __M4RI_DD_RCI(*r);
        __M4RI_DD_RCI(*c);
        __M4RI_DD_INT(1);
        return 1;
      }
    }
  } else {
    /* we definitely have more than one word */
    /* handle first word */
    int const bit_offset = (start_col % m4ri_radix);
    wi_t const word_offset = start_col / m4ri_radix;
    /* mask off the columns before start_col within the first word */
    word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-bit_offset);
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][word_offset] & mask_begin;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        /* bit at start_col itself set: cannot be beaten, stop scanning rows */
        if(__M4RI_GET_BIT(data,bit_offset)) {
          break;
        }
      }
    }
    if(data) {
      *r = row_candidate;
      data >>= bit_offset;
      assert(data);
      for(int l = 0; l < (m4ri_radix - bit_offset); ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = start_col + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
    /* handle complete words */
    for(wi_t wi = word_offset + 1; wi < A->width - 1; ++wi) {
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = A->rows[i][wi];
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
          /* lowest possible bit already set: no better candidate exists */
          if(__M4RI_GET_BIT(data, 0))
            break;
        }
      }
      if(data) {
        *r = row_candidate;
        for(int l = 0; l < m4ri_radix; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = wi * m4ri_radix + l;
            break;
          }
        }
        __M4RI_DD_RCI(*r);
        __M4RI_DD_RCI(*c);
        __M4RI_DD_INT(1);
        return 1;
      }
    }
    /* handle last word */
    int const end_offset = (A->ncols % m4ri_radix) ? (A->ncols % m4ri_radix) : m4ri_radix;
    word const mask_end = __M4RI_LEFT_BITMASK(end_offset % m4ri_radix);
    wi_t wi = A->width - 1;
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][wi] & mask_end;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        if(__M4RI_GET_BIT(data,0))
          break;
      }
    }
    if(data) {
      *r = row_candidate;
      for(int l = 0; l < end_offset; ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = wi * m4ri_radix + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
  }
  __M4RI_DD_RCI(*r);
  __M4RI_DD_RCI(*c);
  __M4RI_DD_INT(0);
  return 0;
}
2153
2154
2155 #define MASK(c) (((uint64_t)(-1)) / (__M4RI_TWOPOW(__M4RI_TWOPOW(c)) + 1))
2156 #define COUNT(x,c) ((x) & MASK(c)) + (((x) >> (__M4RI_TWOPOW(c))) & MASK(c))
2157
m4ri_bitcount(word w)2158 static inline int m4ri_bitcount(word w) {
2159 uint64_t n = __M4RI_CONVERT_TO_UINT64_T(w);
2160 n = COUNT(n, 0);
2161 n = COUNT(n, 1);
2162 n = COUNT(n, 2);
2163 n = COUNT(n, 3);
2164 n = COUNT(n, 4);
2165 n = COUNT(n, 5);
2166 return (int)n;
2167 }
2168
2169
/**
 * \brief Estimate the density (fraction of nonzero bits) of A, restricted
 * to rows >= r and columns >= c.
 *
 * Single-word matrices are counted exactly, bit by bit.  Otherwise, per
 * row: the leading word is counted bit by bit, only every res-th interior
 * word is popcounted (sampling), and the partial last word is counted bit
 * by bit -- so for res > 1 the result is an estimate.
 *
 * \param A   Matrix (unchanged).
 * \param res Sampling resolution for interior words; 0 selects a default
 *            of about 1% of the row width, clamped to at least 1.
 * \param r   First row to consider.
 * \param c   First column to consider.
 *            NOTE(review): the leading-word scan iterates bits c..radix-1
 *            directly, which presumably assumes c < m4ri_radix -- confirm
 *            against callers.
 *
 * \return Counted bits divided by sampled bits, as a double.
 */
double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c) {
  size_t count = 0;
  size_t total = 0;

  if(A->width == 1) {
    /* exact count for matrices that fit in one word per row */
    for(rci_t i = r; i < A->nrows; ++i)
      for(rci_t j = c; j < A->ncols; ++j)
        if(mzd_read_bit(A, i, j))
          ++count;
    return ((double)count)/(1.0 * A->ncols * A->nrows);
  }

  if(res == 0)
    res = A->width / 100;
  if (res < 1)
    res = 1;

  for(rci_t i = r; i < A->nrows; ++i) {
    word *truerow = A->rows[i];
    /* leading word: bit-by-bit from column c to the word boundary */
    for(rci_t j = c; j < m4ri_radix; ++j)
      if(mzd_read_bit(A, i, j))
        ++count;
    total += m4ri_radix;

    /* interior words: sample every res-th word with a popcount */
    for(wi_t j = MAX(1, c / m4ri_radix); j < A->width - 1; j += res) {
      count += m4ri_bitcount(truerow[j]);
      total += m4ri_radix;
    }
    /* trailing partial word: bit-by-bit up to ncols */
    for(int j = 0; j < A->ncols % m4ri_radix; ++j)
      if(mzd_read_bit(A, i, m4ri_radix * (A->ncols / m4ri_radix) + j))
        ++count;
    total += A->ncols % m4ri_radix;
  }

  return (double)count / total;
}
2206
/**
 * \brief Estimate the proportion of nonzero bits in A.
 *
 * Convenience wrapper around _mzd_density() scanning the whole matrix
 * (from row 0, column 0) at the given sampling resolution.
 *
 * \param A   Matrix (unchanged).
 * \param res Sampling resolution; 0 selects a default.
 */
double mzd_density(mzd_t const *A, wi_t res) {
  return _mzd_density(A, res, 0, 0);
}
2210
mzd_first_zero_row(mzd_t const * A)2211 rci_t mzd_first_zero_row(mzd_t const *A) {
2212 word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);
2213 wi_t const end = A->width - 1;
2214 word *row;
2215
2216 for(rci_t i = A->nrows - 1; i >= 0; --i) {
2217 row = A->rows[i];
2218 word tmp = row[0];
2219 for (wi_t j = 1; j < end; ++j)
2220 tmp |= row[j];
2221 tmp |= row[end] & mask_end;
2222 if(tmp) {
2223 __M4RI_DD_INT(i + 1);
2224 return i + 1;
2225 }
2226 }
2227 __M4RI_DD_INT(0);
2228 return 0;
2229 }
2230
/**
 * \brief Extract the upper triangular part of A into U.
 *
 * If U is NULL a fresh k x k copy of A's leading submatrix is allocated
 * (k = min(nrows, ncols)); otherwise U must already be k x k.  All entries
 * strictly below the diagonal are then cleared.
 *
 * \param U Destination (may be NULL).
 * \param A Source matrix (unchanged).
 *
 * \return U.
 */
mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A) {
  rci_t const dim = MIN(A->nrows, A->ncols);
  if (U != NULL) {
    assert(U->nrows == dim && U->ncols == dim);
  } else {
    U = mzd_submatrix(NULL, A, 0, 0, dim, dim);
  }
  /* row 0 keeps everything; in row i clear columns 0 .. i-1 */
  for (rci_t i = 1; i < U->nrows; ++i) {
    wi_t const full_words = i / m4ri_radix;
    int const rest_bits = i % m4ri_radix;
    word *row = U->rows[i];
    for (wi_t w = 0; w < full_words; ++w)
      row[w] = 0;
    if (rest_bits)
      mzd_clear_bits(U, i, full_words * m4ri_radix, rest_bits);
  }
  return U;
}
2246
/**
 * \brief Extract the lower triangular part of A into L.
 *
 * If L is NULL a fresh k x k copy of A's leading submatrix is allocated
 * (k = min(nrows, ncols)); otherwise L must already be k x k.  All entries
 * strictly above the diagonal are then cleared: first the remainder of the
 * word containing column i, then every following full word of row i.
 *
 * \param L Destination (may be NULL).
 * \param A Source matrix (unchanged).
 *
 * \return L.
 */
mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A) {
  rci_t k = MIN(A->nrows, A->ncols);
  if (L == NULL)
    L = mzd_submatrix(NULL, A, 0, 0, k, k);
  else
    assert(L->nrows == k && L->ncols == k);
  for(rci_t i=0; i<L->nrows-1; i++) {
    /* NOTE(review): this guard is always true -- m4ri_radix - (i+1)%m4ri_radix
       lies in [1, m4ri_radix], so when (i+1) is word-aligned a full word is
       cleared here even though the following loop zeroes it again; presumably
       harmless (mzd_clear_bits tolerates a full-radix count) -- confirm. */
    if(m4ri_radix - (i+1)%m4ri_radix)
      mzd_clear_bits(L, i, i+1, m4ri_radix - (i+1)%m4ri_radix);
    /* zero every whole word after the one containing the diagonal */
    for(wi_t j=(i/m4ri_radix+1); j<L->width; j++) {
      L->rows[i][j] = 0;
    }
  }
  return L;
}
2262