1 /******************************************************************************
2 *
3 *            M4RI: Linear Algebra over GF(2)
4 *
5 *    Copyright (C) 2007 Gregory Bard <gregory.bard@ieee.org>
6 *    Copyright (C) 2009-2013 Martin Albrecht <martinralbrecht+m4ri@googlemail.com>
7 *    Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
8 *
9 *  Distributed under the terms of the GNU General Public License (GPL)
10 *  version 2 or higher.
11 *
12 *    This code is distributed in the hope that it will be useful,
13 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 *    General Public License for more details.
16 *
17 *  The full text of the GPL is available at:
18 *
19 *                  http://www.gnu.org/licenses/
20 ******************************************************************************/
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
25 
26 #ifdef __M4RI_HAVE_LIBPNG
27 #include <png.h>
28 #endif
29 
30 #include <stdlib.h>
31 #include <string.h>
32 #include "mzd.h"
33 #include "parity.h"
34 #include "mmc.h"
35 
36 
37 /**
38  * \brief Cache of mzd_t containers
39  */
40 
/**
 * One cache block holding 64 mzd_t headers.
 *
 * Blocks form a doubly-linked list rooted at the statically allocated
 * mzd_cache below.  Bit i of `used` is set when mzd[i] has been handed out
 * by mzd_t_malloc.  The padding brings the trailer (prev/next/used) up to
 * sizeof(mzd_t), so the whole struct is 65 mzd_t-sized slots; mzd_init
 * asserts sizeof(mzd_t) == 64, so the GCC attribute keeps blocks
 * cache-line aligned.
 */
typedef struct mzd_t_cache {
  mzd_t mzd[64]; /*!< cached matrices */
  struct mzd_t_cache *prev; /*!< previous block */
  struct mzd_t_cache *next; /*!< next block */
  uint64_t used; /*!< bitmasks which matrices in this block are used */
  unsigned char padding[sizeof(mzd_t) - 2 * sizeof(struct mzd_t_cache*) - sizeof(uint64_t)]; /*!< alignment */
#ifdef __GNUC__
} mzd_t_cache_t __attribute__ ((__aligned__ (64)));
#else
} mzd_t_cache_t;
#endif
52 
/* Upper bound on the number of cache blocks ever linked into the list. */
#define __M4RI_MZD_T_CACHE_MAX 16
/* Head of the cache list; statically allocated, never freed. */
static mzd_t_cache_t mzd_cache;
/* Block most recently used for allocation; the free-slot search starts here. */
static mzd_t_cache_t* current_cache = &mzd_cache;
56 
/*
 * Return floor(log2(v)) for v > 0, i.e. the index of the highest set bit.
 * Returns 0 for v == 0.
 *
 * Binary search over the bit positions: at each step, if any bit in the
 * upper half of the current window is set, shift it down and record the
 * corresponding power of two in the result.
 */
static int log2_floor(uint64_t v) {
  unsigned int result = 0;
  for (unsigned int span = 32; span != 0; span >>= 1) {
    /* Mask covering bit positions [span, 2*span). */
    uint64_t const upper_half = (((uint64_t)1 << span) - 1) << span;
    if (v & upper_half) {
      v >>= span;
      result |= span;
    }
  }
  return result;
}
71 
72 /*
73  * Return a pointer to a new mzd_t structure.
74  * The structure will be 64 byte aligned.
75  * Call mzd_t_free to free the structure for next use.
76  */
77 
/*
 * Allocate one mzd_t header (64-byte aligned when served from the cache).
 *
 * With the cache enabled, headers come from 64-entry cache blocks; a plain
 * heap allocation is the fallback once __M4RI_MZD_T_CACHE_MAX blocks are
 * all full.  Release with mzd_t_free.
 */
static mzd_t* mzd_t_malloc() {
#if __M4RI_ENABLE_MZD_CACHE == 0
  return (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
#else
  mzd_t *ret = NULL;
  int i=0;

  if (current_cache->used == (uint64_t)-1) {
    /* Current block is full: scan the list for one with a free slot,
     * counting blocks as we go. */
    mzd_t_cache_t *cache = &mzd_cache;
    while (cache && cache->used == (uint64_t)-1) {
      current_cache = cache;
      cache = cache->next;
      i++;
    }
    if (!cache && i< __M4RI_MZD_T_CACHE_MAX) {
      /* All blocks full, but we may still grow: append a zeroed block. */
      cache = (mzd_t_cache_t*)m4ri_mm_malloc_aligned(sizeof(mzd_t_cache_t), 64);
      memset((char*)cache, 0, sizeof(mzd_t_cache_t));

      cache->prev = current_cache;
      current_cache->next = cache;
      current_cache = cache;
    } else if (!cache && i>= __M4RI_MZD_T_CACHE_MAX) {
      /* We have reached the upper limit on the number of caches */
      ret = (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
    } else {
      current_cache = cache;
    }
  }
  if (ret == NULL) {
    /* ~used has a set bit for every free slot; log2_floor picks the
     * highest-numbered one. */
    int free_entry =log2_floor(~current_cache->used);
    current_cache->used |= ((uint64_t)1 << free_entry);
    ret = &current_cache->mzd[free_entry];
  }
  return ret;
#endif //__M4RI_ENABLE_MZD_CACHE
}
114 
/*
 * Return a header obtained from mzd_t_malloc.
 *
 * If M lives inside a cache block, its slot is marked free and a completely
 * emptied heap-allocated block is unlinked and released (the static head
 * block is kept).  Otherwise M was a plain heap allocation and is freed
 * directly.
 */
static void mzd_t_free(mzd_t *M) {
#if __M4RI_ENABLE_MZD_CACHE == 0
  m4ri_mm_free(M);
#else
  int foundit = 0;
  mzd_t_cache_t *cache = &mzd_cache;
  while(cache) {
    /* NOTE(review): when M does not belong to this block the subtraction is
     * formally undefined (C11 6.5.6); the code relies on the unsigned result
     * being >= 64 for foreign pointers -- confirm this is the intended
     * membership test. */
    size_t entry = M - cache->mzd;
    if (entry < 64) {
      cache->used &= ~((uint64_t)1 << entry);
      if (cache->used == 0) {
        /* Block is now empty: keep the static head, unlink and free others. */
        if (cache == &mzd_cache) {
          current_cache = cache;
        } else {
          if (cache == current_cache) {
            current_cache = cache->prev;
          }
          cache->prev->next = cache->next;
          if (cache->next)
            cache->next->prev = cache->prev;
          m4ri_mm_free(cache);
        }
      }
      foundit = 1;
      break;
    }
    cache = cache->next;
  }
  if(!foundit) {
    /* M was heap-allocated after the cache limit was reached. */
    m4ri_mm_free(M);
  }
#endif //__M4RI_ENABLE_MZD_CACHE
}
148 
/*
 * Create an r x c matrix over GF(2), zero-initialized.
 *
 * Rows are packed 64 columns per word; storage is split into equally sized
 * memory blocks of at most __M4RI_MAX_MZD_BLOCKSIZE words each, with a
 * per-row pointer table for O(1) row access.  Free with mzd_free.
 */
mzd_t *mzd_init(rci_t r, rci_t c) {
  assert(sizeof(mzd_t) == 64);

  mzd_t *A = mzd_t_malloc();

  A->nrows = r;
  A->ncols = c;
  A->width = (c + m4ri_radix - 1) / m4ri_radix;  /* words needed per row */
  /* Pad rows to an odd word count becomes even: long odd-width rows get one
   * extra word of stride. */
  A->rowstride = (A->width < mzd_paddingwidth || (A->width & 1) == 0) ? A->width : A->width + 1;
  A->high_bitmask = __M4RI_LEFT_BITMASK(c % m4ri_radix);
  A->flags = (A->high_bitmask != m4ri_ffff) ? mzd_flag_nonzero_excess : 0;
  A->offset_vector = 0;
  A->row_offset = 0;

  A->rows = (word**)m4ri_mmc_calloc(r + 1, sizeof(word*)); // We're overcomitting here.

  if (r && c) {
    /* blockrows = largest power of two rows fitting in one memory block. */
    int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride;
    A->blockrows_log = 0;
    while(blockrows >>= 1)
      A->blockrows_log++;
    blockrows = 1 << A->blockrows_log;

    int const blockrows_mask = blockrows - 1;
    int const nblocks = (r + blockrows - 1) / blockrows;
    A->flags |= (nblocks > 1) ? mzd_flag_multiple_blocks : 0;
    /* nblocks + 1 entries: the zeroed sentinel terminates mzd_free's loop. */
    A->blocks = (mzd_block_t*)m4ri_mmc_calloc(nblocks + 1, sizeof(mzd_block_t));

    /* The last block may be partial; size it first, then all earlier blocks
     * are full (blockrows rows each). */
    size_t block_words = (r - (nblocks - 1) * blockrows) * A->rowstride;
    for(int i = nblocks - 1; i >= 0; --i) {
      A->blocks[i].size = block_words * sizeof(word);
      A->blocks[i].begin = (word*)m4ri_mmc_calloc(1, A->blocks[i].size);
      A->blocks[i].end = A->blocks[i].begin + block_words;
      block_words = blockrows * A->rowstride;
    }

    /* Row pointer table: row i lives in block i >> blockrows_log. */
    for(rci_t i = 0; i < A->nrows; ++i) {
      A->rows[i] = A->blocks[i >> A->blockrows_log].begin + (i & blockrows_mask) * A->rowstride;
    }

  } else {
    A->blocks = NULL;
  }

  return A;
}
195 
196 /*
197    Explanation of offset_vector (in words), and row_offset.
198 
199    <------------------------------- row_stride (in words)--------------------->
200    .---------------------------------------------------------------------------.  <-- m->blocks[0].begin   ^
201    |                                 ^                                        /|                           |
202    |                    m->row_offset|                      m->offset_vector_/ |                           |
203    |                                 v                                      /  |                           |
204    |  .--------------------------------------------------------------------v<--|---- m->rows[0]            |_ skipped_blocks (in blocks)
205    |  |m (also a window)             ^                                     |   |                           |
206    |  |                              |                                     |   |                           |
207    `---------------------------------|-----------------------------------------'                           v
208    .---------------------------------|----------------------------------------_.  <-- m->blocks[1].begin <-- windows.blocks[0].begin
209    |  |                      ^   lowr|                                     |_^ |
210    |  |    window->row_offset|       |            window->offset_vector _-^|   |
211    |  |                      v       v                               _-^   |   |
212    |  |  .----------------------------------------------------------v<--.  |<--|---- m->rows[lowr]
213    |  |  |window                                                    |    `-|---|---- window->rows[0]
214    |  |  |                                                          |      |   |
215    `---------------------------------------------------------------------------'
216    .---------------------------------------------------------------------------.  <-- m->blocks[2].begin <-- windows.blocks[1].begin
217    |  |  |                                                          |      |   |
218    |  |  |                                                          | lowc |   |
219    |  |  |                                                          |<---->|   |
220    |  |  |                                                          |   \__|___|__ also wrd_offset (in words)
221    |  |  `----------------------------------------------------------'      |   |
222    |  `--------------------------------------------------------------------'   |
223    `---------------------------------------------------------------------------'
224    .---------------------------------------------------------------------------.
225    |                                                                           |
226 
227 */
228 
/*
 * Create a window (shared view) into M covering rows [lowr, highr) and
 * columns [lowc, highc).  No matrix data is copied: the window points into
 * M's blocks, so writes through the window modify M.  lowc must be a
 * multiple of m4ri_radix.  Free with mzd_free (the blocks are not owned).
 */
mzd_t *mzd_init_window(mzd_t *M, const rci_t lowr, const rci_t lowc, const rci_t highr, const rci_t highc) {
  assert(lowc % m4ri_radix == 0);

  mzd_t *W = mzd_t_malloc();

  rci_t nrows = MIN(highr - lowr, M->nrows - lowr);
  rci_t ncols = highc - lowc;

  W->nrows = nrows;
  W->ncols = ncols;
  W->rowstride = M->rowstride;
  W->width = (ncols + m4ri_radix - 1) / m4ri_radix;
  W->high_bitmask = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);

  W->flags = mzd_flag_windowed_zerooffset;
  W->flags |= (ncols % m4ri_radix == 0) ? mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess;
  W->blockrows_log = M->blockrows_log;

  /* Blocks lying entirely above the window are skipped; see the diagram
   * above for how row_offset and offset_vector relate. */
  wi_t const blockrows_mask = (1 << W->blockrows_log) - 1;
  int const skipped_blocks = (M->row_offset + lowr) >> W->blockrows_log;
  assert(skipped_blocks == 0 || ((M->flags & mzd_flag_multiple_blocks)));
  W->row_offset = (M->row_offset + lowr) & blockrows_mask;
  W->blocks = &M->blocks[skipped_blocks];
  wi_t const wrd_offset = lowc / m4ri_radix;
  W->offset_vector = (M->offset_vector + wrd_offset) + (W->row_offset - M->row_offset) * W->rowstride;
  if(nrows)
    W->rows = (word**)m4ri_mmc_calloc(nrows + 1, sizeof(word*));
  else
    W->rows = NULL;
  for(rci_t i = 0; i < nrows; ++i) {
    W->rows[i] = M->rows[lowr + i] + wrd_offset;
  }
  if (mzd_row_to_block(W, nrows - 1) > 0)
    W->flags |= M->flags & mzd_flag_multiple_blocks;

  /* offset_vector is the distance from the start of the first block to the first word of the first row. */
  assert(nrows == 0 || W->blocks[0].begin + W->offset_vector == W->rows[0]);

  __M4RI_DD_MZD(W);
  return W;
}
270 
/*
 * Release a matrix created by mzd_init or mzd_init_window.
 *
 * The row-pointer table is always freed; the data blocks are freed only
 * when A owns them (i.e. A is not a window).  Finally the header itself is
 * returned to the header cache / heap.
 */
void mzd_free(mzd_t *A) {
  if(A->rows)
    m4ri_mmc_free(A->rows, (A->nrows + 1) * sizeof(word*));
  if(mzd_owns_blocks(A)) {
    /* The block array is terminated by a zero-sized sentinel entry. */
    int nblocks = 0;
    while (A->blocks[nblocks].size) {
      m4ri_mmc_free(A->blocks[nblocks].begin, A->blocks[nblocks].size);
      ++nblocks;
    }
    m4ri_mmc_free(A->blocks, (nblocks + 1) * sizeof(mzd_block_t));
  }
  mzd_t_free(A);
}
283 
/*
 * Add (XOR) sourcerow onto destrow over the full row width.
 * Note the callee's argument order is (M, dstrow, srcrow, coloffset).
 */
void mzd_row_add(mzd_t *M, rci_t sourcerow, rci_t destrow) {
  mzd_row_add_offset(M, destrow, sourcerow, 0);
}
287 
/*
 * Clear row `row` of M starting at column `coloffset` (per the original
 * comment below; see the review note about the first-word mask).
 */
void mzd_row_clear_offset(mzd_t *M, rci_t row, rci_t coloffset) {
  wi_t const startblock = coloffset / m4ri_radix;
  word temp;

  /* make sure to start clearing at coloffset */
  if (coloffset%m4ri_radix) {
    temp = M->rows[row][startblock];
    /* NOTE(review): __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset) selects the
     * bit positions >= coloffset % m4ri_radix (the same mask convention
     * mzd_row_add_offset uses to touch columns >= coloffset), so this seems
     * to KEEP the columns from coloffset on and clear the ones before it --
     * the opposite of the stated intent.  Verify against
     * __M4RI_LEFT_BITMASK(coloffset % m4ri_radix); possibly only ever called
     * with word-aligned coloffset, which would mask the issue. */
    temp &= __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset);
  } else {
    temp = 0;
  }
  M->rows[row][startblock] = temp;
  /* All words after the first affected one are zeroed entirely. */
  for (wi_t i = startblock + 1; i < M->width; ++i) {
    M->rows[row][i] = 0;
  }

  __M4RI_DD_ROW(M, row);
}
306 
307 
/*
 * Gaussian elimination over GF(2) starting at column `startcol`, assuming
 * every column before it has already been processed (so the first candidate
 * pivot row is row `startcol` as well).
 *
 * \param M        Matrix, modified in place.
 * \param startcol First column to process.
 * \param full     If non-zero, also reduce rows above each pivot (RREF).
 *
 * \return Number of pivots found (the rank of the processed part).
 */
rci_t mzd_gauss_delayed(mzd_t *M, rci_t startcol, int full) {
  rci_t pivot_row = startcol;
  rci_t rank = 0;

  for (rci_t col = startcol; col < M->ncols; ++col) {
    /* Search for a pivot in column `col` at or below pivot_row. */
    for (rci_t row = pivot_row; row < M->nrows; ++row) {
      if (!mzd_read_bit(M, row, col))
        continue;
      mzd_row_swap(M, pivot_row, row);
      ++rank;
      /* Eliminate column `col` from the other rows: all of them when a
       * full reduction is requested, only the rows below otherwise. */
      rci_t const first = full ? 0 : pivot_row + 1;
      for (rci_t other = first; other < M->nrows; ++other) {
        if (other == pivot_row)
          continue;
        if (mzd_read_bit(M, other, col))
          mzd_row_add_offset(M, other, pivot_row, col);
      }
      ++pivot_row;
      break;
    }
  }

  __M4RI_DD_MZD(M);
  __M4RI_DD_RCI(rank);
  return rank;
}
334 
/*
 * Naive Gaussian elimination of the whole matrix (starting at column 0).
 *
 * \param M    Matrix to echelonize, modified in place.
 * \param full If non-zero compute the reduced row echelon form.
 *
 * \return Rank of M.
 */
rci_t mzd_echelonize_naive(mzd_t *M, int full) {
  return mzd_gauss_delayed(M, 0, full);
}
338 
339 /**
340  * Transpose a 64 x 64 matrix with width 1.
341  *
342  * \param dst First word of destination matrix.
343  * \param src First word of source matrix.
344  * \param rowstride_dst Rowstride of matrix dst.
345  * \param rowstride_src Rowstride of matrix src.
346  *
347  * Rows of both matrices are expected to fit exactly in a word (offset == 0)
348  * and lay entirely inside a single block.
349  *
350  * \note This function also works when dst == src.
351  */
352 
static inline void _mzd_copy_transpose_64x64(word* dst, word const* src, wi_t rowstride_dst, wi_t rowstride_src)
{
  /*
   * m runs over the values:
   *   0x00000000FFFFFFFF
   *   0x0000FFFF0000FFFF
   *   0x00FF00FF00FF00FF
   *   0x0F0F0F0F0F0F0F0F
   *   0x3333333333333333
   *   0x5555555555555555,
   * alternating j zeroes with j ones.
   *
   * Assume we have a matrix existing of four jxj matrices ((0,0) is in the top-right corner,
   * this is the memory-model view, see the layout on http://m4ri.sagemath.org/doxygen/structmzd__t.html):
   * ...[A1][B1][A0][B0]
   * ...[C1][D1][C0][D0]
   *          . [A2][B2]
   *        .   [C2][B2]
   *      .         .
   *                .
   * The following calulates the XOR between A and D,
   * and subsequently applies that to A and D respectively,
   * swapping A and D as a result.
   * Therefore wk starts at the first row and then has rowstride
   * added j times, running over the rows of A, then skips C
   * by adding j * rowstride to continue with the next A below C.
   */

  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst + j_rowstride_dst;
  // We start with j = 32, and a one-time unrolled loop, where
  // we copy from src and write the result to dst, swapping
  // the two 32x32 corner matrices.
  int j = 32;
  j_rowstride_dst >>= 1;
  word* RESTRICT wk = dst;
  for (word const* RESTRICT wks = src; wk < end; wk += j_rowstride_dst, wks += j_rowstride_src) {
    for (int k = 0; k < j; ++k, wk += rowstride_dst, wks += rowstride_src) {
      // Masked-XOR swap of the two 32-bit halves, 32 rows apart.
      word xor = ((*wks >> j) ^ *(wks + j_rowstride_src)) & m;
      *wk = *wks ^ (xor << j);
      *(wk + j_rowstride_dst) = *(wks + j_rowstride_src) ^ xor;
    }
  }
  // Next we work in-place in dst and swap the corners of
  // each of the last matrices, all in parallel, for all
  // remaining values of j.
  m ^= m << 16;  // m becomes 0x0000FFFF0000FFFF, the j = 16 mask.
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {
    j_rowstride_dst >>= 1;
    for (wk = dst; wk < end; wk += j_rowstride_dst) {
      for (int k = 0; k < j; ++k, wk += rowstride_dst) {
	word xor = ((*wk >> j) ^ *(wk + j_rowstride_dst)) & m;
        *wk ^= xor << j;
	*(wk + j_rowstride_dst) ^= xor;
      }
    }
  }
}
413 
414 /**
415  * Transpose two 64 x 64 matrix with width 1.
416  *
417  * \param dst1 First word of destination matrix 1.
418  * \param dst2 First word of destination matrix 2.
419  * \param src1 First word of source matrix 1.
420  * \param src2 First word of source matrix 2.
421  * \param rowstride_dst Rowstride of destination matrices.
422  * \param rowstride_src Rowstride of source matrices.
423  *
424  * Rows of all matrices are expected to fit exactly in a word (offset == 0)
425  * and lay entirely inside a single block.
426  *
427  * \note This function also works to transpose in-place.
428  */
429 
static inline void _mzd_copy_transpose_64x64_2(word* RESTRICT dst1, word* RESTRICT dst2, word const* RESTRICT src1, word const* RESTRICT src2, wi_t rowstride_dst, wi_t rowstride_src)
{
  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst1 + j_rowstride_dst;
  int j = 32;
  word* RESTRICT wk[2];
  word const* RESTRICT wks[2];
  word xor[2];

  // First pass (j == 32): copy each source to its destination while
  // swapping the two 32x32 corner matrices.  Same algorithm as
  // _mzd_copy_transpose_64x64, but interleaved over both matrices to
  // expose more instruction-level parallelism.
  j_rowstride_dst >>= 1;
  wk[0] = dst1;
  wk[1] = dst2;
  wks[0] = src1;
  wks[1] = src2;

  do {

    for (int k = 0; k < j; ++k) {
      xor[0] = ((*wks[0] >> j) ^ *(wks[0] + j_rowstride_src)) & m;
      xor[1] = ((*wks[1] >> j) ^ *(wks[1] + j_rowstride_src)) & m;
      *wk[0] = *wks[0] ^ (xor[0] << j);
      *wk[1] = *wks[1] ^ (xor[1] << j);
      *(wk[0] + j_rowstride_dst) = *(wks[0] + j_rowstride_src) ^ xor[0];
      *(wk[1] + j_rowstride_dst) = *(wks[1] + j_rowstride_src) ^ xor[1];
      wk[0] += rowstride_dst;
      wk[1] += rowstride_dst;
      wks[0] += rowstride_src;
      wks[1] += rowstride_src;
    }

    // Skip the lower-left quadrant rows already handled as partners.
    wk[0] += j_rowstride_dst;
    wk[1] += j_rowstride_dst;
    wks[0] += j_rowstride_src;
    wks[1] += j_rowstride_src;

  } while(wk[0] < end);

  // Remaining passes (j = 16 down to 1): swap the sub-matrix corners
  // in-place in the destinations.
  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {

    j_rowstride_dst >>= 1;
    wk[0] = dst1;
    wk[1] = dst2;

    do {

      for (int k = 0; k < j; ++k) {
	xor[0] = ((*wk[0] >> j) ^ *(wk[0] + j_rowstride_dst)) & m;
	xor[1] = ((*wk[1] >> j) ^ *(wk[1] + j_rowstride_dst)) & m;
	*wk[0] ^= xor[0] << j;
	*wk[1] ^= xor[1] << j;
	*(wk[0] + j_rowstride_dst) ^= xor[0];
	*(wk[1] + j_rowstride_dst) ^= xor[1];
	wk[0] += rowstride_dst;
	wk[1] += rowstride_dst;
      }

      wk[0] += j_rowstride_dst;
      wk[1] += j_rowstride_dst;

    } while(wk[0] < end);
  }
}
495 
/* log2_ceil_table[n - 1] == ceil(log2(n)) for 1 <= n <= 64. */
static unsigned char log2_ceil_table[64] = {
  0, 1, 2, 2, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6
};
506 
/* Return ceil(log2(n)), the smallest e with 2^e >= n, for 1 <= n <= 64. */
static inline int log2_ceil(int n)
{
  int e = 0;
  while ((1 << e) < n)
    ++e;
  return e;
}
511 
/* transpose_mask[i] alternates runs of 2^i one bits and 2^i zero bits
 * (ones starting at the least significant end); mask i is used by
 * _mzd_transpose_Nxjx64 when swapping sub-matrices of size 2^i. */
static word const transpose_mask[6] = {
  0x5555555555555555ULL,
  0x3333333333333333ULL,
  0x0F0F0F0F0F0F0F0FULL,
  0x00FF00FF00FF00FFULL,
  0x0000FFFF0000FFFFULL,
  0x00000000FFFFFFFFULL,
};
520 
521 /**
522  * Transpose 64/j matrices of size jxj in parallel.
523  *
524  * Where j equals n rounded up to the nearest power of 2.
525  * The input array t must be of size j (containing the rows i of all matrices in t[i]).
526  *
527  * t[0..{j-1}]  = [Al]...[A1][A0]
528  *
529  * \param t An array of j words.
530  * \param n The number of rows in each matrix.
531  *
532  * \return log2(j)
533  */
534 
static inline int _mzd_transpose_Nxjx64(word* RESTRICT t, int n)
{
  int j = 1;		// Current sub-matrix size (doubles each round).
  int mi = 0;		// Index into the transpose_mask array.

  while (j < n)		// Don't swap with entirely undefined data (where [D] exists entirely of non-existant rows).
  {
    // Swap 64/j matrices of size jxj in 2j rows. Thus,
    // <---- one word --->
    // [Al][Bl]...[A0][B0]
    // [Cl][Dl]...[C0][D0], where l = 64/j - 1 and each matrix [A], [B] etc is jxj.
    // Then swap [A] and [D] in-place.

    // m runs over the values in transpose_mask, so that at all
    // times m exists of j zeroes followed by j ones, repeated.
    word const m = transpose_mask[mi];
    int k = 0;		// Index into t[].
    do {
      // Run over all rows of [A] and [D].
      for (int i = 0; i < j; ++i, ++k) {
	// t[k] contains row i of all [A], and t[k + j] contains row i of all [D]. Swap them.
	word xor = ((t[k] >> j) ^ t[k + j]) & m;
	t[k] ^= xor << j;
	t[k + j] ^= xor;
      }
      k += j;		// Skip [C].
    } while (k < n);	// Stop if we passed all valid input.

    // Double the size of j and repeat this for the next 2j rows until all
    // n rows have been swapped (possibly with non-existant rows).
    j <<= 1;
    ++mi;
  }

  return mi;		// == log2 of the final sub-matrix size j.
}
571 
572 /**
573  * Transpose a n x 64 matrix with width 1.
574  *
575  * \param dst First word of destination matrix.
576  * \param src First word of source matrix.
577  * \param rowstride_dst Rowstride of destination matrix.
578  * \param rowstride_src Rowstride of source matrix.
579  * \param n Number of rows in source matrix, must be less than 64.
580  *
581  * Rows of all matrices are expected have offset zero
582  * and lay entirely inside a single block.
583  *
584  * \note This function also works to transpose in-place.
585  */
586 
static inline void _mzd_copy_transpose_lt64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  // Preload the n input rows into level 1, using a minimum of cache lines (compact storage).
  word t[64];
  word const* RESTRICT wks = src;
  int k;
  for (k = 0; k < n; ++k) {
    t[k] = *wks;
    wks += rowstride_src;
  }
  // Zero-pad the buffer up to 64 rows: the swap networks below read all of it.
  // see https://bitbucket.org/malb/m4ri/issues/53
  for (; k < 64; ++k) {
    t[k] = 0;
  }
  if (n > 32) {
    // More than half the rows are present: a full 64x64 transpose of the
    // (already fully zero-padded) buffer is the cheapest option.  A
    // previously separate padding loop here was made redundant by the
    // unconditional padding above and has been removed.
    _mzd_copy_transpose_64x64(dst, t, rowstride_dst, 1);
    return;
  }
  int log2j = _mzd_transpose_Nxjx64(t, n);
  // All output bits are now transposed, but still might need to be shifted in place.
  // What we have now is 64/j matrices of size jxj. Thus,
  // [Al]...[A1][A0], where l = 64/j - 1.
  // while the actual output is:
  // [A0]
  // [A1]
  // ...
  // [Al]
  // m masks the n valid output columns of each destination row.
  word const m = __M4RI_LEFT_BITMASK(n);
  word* RESTRICT wk = dst;
  switch (log2j) {
    case 5:
    {
      wi_t const j_rowstride_dst = 32 * rowstride_dst;
      for (int k = 0; k < 32; ++k) {
        wk[0] = t[k] & m;
        wk[j_rowstride_dst] = (t[k] >> 32) & m;
        wk += rowstride_dst;
      }
      break;
    }
    case 4:
    {
      wi_t const j_rowstride_dst = 16 * rowstride_dst;
      for (int k = 0; k < 16; ++k) {
        wk[0] = t[k] & m;
        wk[j_rowstride_dst] = (t[k] >> 16) & m;
        wk[2 * j_rowstride_dst] = (t[k] >> 32) & m;
        wk[3 * j_rowstride_dst] = (t[k] >> 48) & m;
        wk += rowstride_dst;
      }
      break;
    }
    case 3:
    {
      wi_t const j_rowstride_dst = 8 * rowstride_dst;
      for (int k = 0; k < 8; ++k) {
        wk[0] = t[k] & m;
        wk[j_rowstride_dst] = (t[k] >> 8) & m;
        wk[2 * j_rowstride_dst] = (t[k] >> 16) & m;
        wk[3 * j_rowstride_dst] = (t[k] >> 24) & m;
        wk[4 * j_rowstride_dst] = (t[k] >> 32) & m;
        wk[5 * j_rowstride_dst] = (t[k] >> 40) & m;
        wk[6 * j_rowstride_dst] = (t[k] >> 48) & m;
        wk[7 * j_rowstride_dst] = (t[k] >> 56) & m;
        wk += rowstride_dst;
      }
      break;
    }
    case 2:
    {
      wi_t const j_rowstride_dst = 4 * rowstride_dst;
      for (int k = 0; k < 4; ++k) {
        word* RESTRICT wk2 = wk;
        word tk = t[k];
        for (int i = 0; i < 2; ++i) {
          wk2[0] = tk & m;
          wk2[j_rowstride_dst] = (tk >> 4) & m;
          wk2[2 * j_rowstride_dst] = (tk >> 8) & m;
          wk2[3 * j_rowstride_dst] = (tk >> 12) & m;
          wk2[4 * j_rowstride_dst] = (tk >> 16) & m;
          wk2[5 * j_rowstride_dst] = (tk >> 20) & m;
          wk2[6 * j_rowstride_dst] = (tk >> 24) & m;
          wk2[7 * j_rowstride_dst] = (tk >> 28) & m;
          wk2 += 8 * j_rowstride_dst;
          tk >>= 32;
        }
        wk += rowstride_dst;
      }
      break;
    }
    case 1:
    {
      wi_t const j_rowstride_dst = 2 * rowstride_dst;
      for (int k = 0; k < 2; ++k) {
        word* RESTRICT wk2 = wk;
        word tk = t[k];
        for (int i = 0; i < 8; ++i) {
          wk2[0] = tk & m;
          wk2[j_rowstride_dst] = (tk >> 2) & m;
          wk2[2 * j_rowstride_dst] = (tk >> 4) & m;
          wk2[3 * j_rowstride_dst] = (tk >> 6) & m;
          wk2 += 4 * j_rowstride_dst;
          tk >>= 8;
        }
        wk += rowstride_dst;
      }
      break;
    }
    case 0:
    {
      word* RESTRICT wk2 = wk;
      word tk = t[0];
      for (int i = 0; i < 16; ++i) {
        wk2[0] = tk & m;
        wk2[rowstride_dst] = (tk >> 1) & m;
        wk2[2 * rowstride_dst] = (tk >> 2) & m;
        wk2[3 * rowstride_dst] = (tk >> 3) & m;
        wk2 += 4 * rowstride_dst;
        tk >>= 4;
      }
      break;
    }
  }
}
713 
714 /**
715  * Transpose a 64 x n matrix with width 1.
716  *
717  * \param dst First word of destination matrix.
718  * \param src First word of source matrix.
719  * \param rowstride_dst Rowstride of destination matrix.
720  * \param rowstride_src Rowstride of source matrix.
721  * \param n Number of columns in source matrix, must be less than 64.
722  *
723  * Rows of all matrices are expected have offset zero
724  * and lay entirely inside a single block.
725  *
726  * \note This function also works to transpose in-place.
727  */
728 
static inline void _mzd_copy_transpose_64xlt64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  word t[64];
  int log2j = log2_ceil(n);
  word const* RESTRICT wks = src;
  // Each case packs the 64 input rows (n bits wide) into the compact buffer
  // t[] so that _mzd_transpose_Nxjx64 can swap jxj sub-matrices, where
  // j = 2^log2j is n rounded up to a power of two.
  switch (log2j) {
    case 6:
    {
      // n > 32: just do a full 64x64 transpose into t and copy out n rows.
      _mzd_copy_transpose_64x64(t, src, 1, rowstride_src);
      word* RESTRICT wk = dst;
      for (int k = 0; k < n; ++k) {
	*wk = t[k];
	wk += rowstride_dst;
      }
      return;
    }
    case 5:
    {
      // Pack two 32-bit half-rows per buffer word.
      wi_t const j_rowstride_src = 32 * rowstride_src;
      for (int k = 0; k < 32; ++k) {
	t[k] = wks[0] | (wks[j_rowstride_src] << 32);
	wks += rowstride_src;
      }
      break;
    }
    case 4:
    {
      // Pack four 16-bit pieces per buffer word.
      wi_t const j_rowstride_src = 16 * rowstride_src;
      for (int k = 0; k < 16; ++k) {
	t[k] = wks[0] | (wks[j_rowstride_src] << 16);
	t[k] |= (wks[2 * j_rowstride_src] << 32) | (wks[3 * j_rowstride_src] << 48);
	wks += rowstride_src;
      }
      break;
    }
    case 3:
    {
      // Pack eight 8-bit pieces per buffer word; tt splits the dependency
      // chain in two so the ORs can execute in parallel.
      wi_t const j_rowstride_src = 8 * rowstride_src;
      word tt;
      for (int k = 0; k < 8; ++k) {
	tt = wks[0] | (wks[j_rowstride_src] << 8);
	t[k] = (wks[2 * j_rowstride_src] << 16) | (wks[3 * j_rowstride_src] << 24);
	tt |= (wks[4 * j_rowstride_src] << 32) | (wks[5 * j_rowstride_src] << 40);
	t[k] |= (wks[6 * j_rowstride_src] << 48) | (wks[7 * j_rowstride_src] << 56);
	wks += rowstride_src;
	t[k] |= tt;
      }
      break;
    }
    case 2:
    {
      // Pack sixteen 4-bit pieces per buffer word, accumulating from the
      // bottom rows upward via shift-and-or.
      word const* RESTRICT wks2 = wks + 60 * rowstride_src;
      t[0] = wks2[0];
      t[1] = wks2[rowstride_src];
      t[2] = wks2[2 * rowstride_src];
      t[3] = wks2[3 * rowstride_src];
      for (int i = 0; i < 15; ++i) {
	wks2 -= 4 * rowstride_src;
	t[0] <<= 4;
	t[1] <<= 4;
	t[2] <<= 4;
	t[3] <<= 4;
	t[0] |= wks2[0];
	t[1] |= wks2[rowstride_src];
	t[2] |= wks2[2 * rowstride_src];
	t[3] |= wks2[3 * rowstride_src];
      }
      break;
    }
    case 1:
    {
      // Pack thirty-two 2-bit pieces per buffer word, bottom-up.
      wks += 62 * rowstride_src;
      t[0] = wks[0];
      t[1] = wks[rowstride_src];
      for (int i = 0; i < 31; ++i) {
	wks -= 2 * rowstride_src;
	t[0] <<= 2;
	t[1] <<= 2;
	t[0] |= wks[0];
	t[1] |= wks[rowstride_src];
      }
      break;
    }
    case 0:
    {
      // n == 1: interleave the 64 single-bit rows directly into one word.
      word tt[2];
      tt[0] = wks[0];
      tt[1] = wks[rowstride_src];
      for (int i = 2; i < 64; i += 2) {
	wks += 2 * rowstride_src;
	tt[0] |= wks[0] << i;
	tt[1] |= wks[rowstride_src] << i;
      }
      *dst = tt[0] | (tt[1] << 1);
      return;
    }
  }
  int j  = 1 << log2j;
  _mzd_transpose_Nxjx64(t, j);
  // Copy the n transposed rows out of the compact buffer.
  word* RESTRICT wk = dst;
  for (int k = 0; k < n; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
}
834 
835 /**
836  * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 8.
837  *
838  * \param dst First word of destination matrix.
839  * \param src First word of source matrix.
840  * \param rowstride_dst Rowstride of destination matrix.
841  * \param rowstride_src Rowstride of source matrix.
842  * \param n Number of rows in source matrix, must be less than or equal 8.
843  * \param m Number of columns in source matrix, must be less than or equal 8.
844  *
845  * Rows of all matrices are expected to have offset zero
846  * and lay entirely inside a single block.
847  *
848  * \note This function also works to transpose in-place.
849  */
850 
static inline void _mzd_copy_transpose_le8xle8(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 7;
  word const* RESTRICT wks = src;
  // Gather the n input rows (8 bits each) into a single word, row i at
  // bit offset 8*i.
  word w = *wks;
  int shift = 0;
  for (int i = 1; i < n; ++i) {
    wks += rowstride_src;
    shift += 8;
    w |= (*wks << shift);
  }
  // In-register transpose of the 8x8 bit matrix held in w, via masked XOR
  // swaps of elements 7 bit positions apart (the classic word-parallel 8x8
  // transpose, cf. Hacker's Delight); `end` bounds the number of rounds to
  // what maxsize = max(n, m) requires.
  word mask = 0x80402010080402ULL;
  word w7 = w >> 7;
  shift = 7;
  --m;
  do {
    word xor = (w ^ w7) & mask;
    mask >>= 8;
    w ^= (xor << shift);
    shift += 7;
    w7 >>= 7;
    w ^= xor;
  } while(shift < end);
  // Scatter the m+1 output rows, highest row first.  (The loop-local
  // `shift` intentionally shadows the outer one.)
  word* RESTRICT wk = dst + m * rowstride_dst;
  for (int shift = 8 * m; shift > 0; shift -= 8) {
    *wk = (unsigned char)(w >> shift);
    wk -= rowstride_dst;
  }
  *wk = (unsigned char)w;
}
881 
882 /**
883  * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 16.
884  *
885  * \param dst First word of destination matrix.
886  * \param src First word of source matrix.
887  * \param rowstride_dst Rowstride of destination matrix.
888  * \param rowstride_src Rowstride of source matrix.
889  * \param n Number of rows in source matrix, must be less than or equal 16.
890  * \param m Number of columns in source matrix, must be less than or equal 16.
891  *
892  * Rows of all matrices are expected to have offset zero
893  * and lay entirely inside a single block.
894  *
895  * \note This function also works to transpose in-place.
896  */
897 
static inline void _mzd_copy_transpose_le16xle16(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  /* Swap rounds below run while shift < end; shift grows by 12 per round. */
  int end = maxsize * 3;
  word const* RESTRICT wks = src;
  word t[4];
  int i = n;
  /* Gather: t[k] accumulates source rows k, k+4, k+8, k+12 in its four
     16-bit lanes (lane selected by the growing shift).  The break-chain
     zero-fills only the lanes of t[0..3] that receive no row at shift 0;
     later lanes of unused slots stay zero because nothing is OR-ed in. */
  do {
    t[0] = wks[0];
    if (--i == 0) {
      t[1] = 0;
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[1] = wks[rowstride_src];
    if (--i == 0) {
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[2] = wks[2 * rowstride_src];
    if (--i == 0) {
      t[3] = 0;
      break;
    }
    t[3] = wks[3 * rowstride_src];
    if (--i == 0)
      break;
    wks += 4 * rowstride_src;
    for(int shift = 16;; shift += 16) {
      t[0] |= (*wks << shift);
      if (--i == 0)
	break;
      t[1] |= (wks[rowstride_src] << shift);
      if (--i == 0)
	break;
      t[2] |= (wks[2 * rowstride_src] << shift);
      if (--i == 0)
	break;
      t[3] |= (wks[3 * rowstride_src] << shift);
      if (--i == 0)
	break;
      wks += 4 * rowstride_src;
    }
  } while(0);
  /* Delta-swap inside each word: exchange 4-bit groups that sit 12 bit
     positions apart (mask selects the groups; it moves down 16 bits per
     round so every 16-bit lane is handled). */
  word mask = 0xF0000F0000F0ULL;
  int shift = 12;
  word xor[4];
  do {
    xor[0] = (t[0] ^ (t[0] >> shift)) & mask;
    xor[1] = (t[1] ^ (t[1] >> shift)) & mask;
    xor[2] = (t[2] ^ (t[2] >> shift)) & mask;
    xor[3] = (t[3] ^ (t[3] >> shift)) & mask;
    mask >>= 16;
    t[0] ^= (xor[0] << shift);
    t[1] ^= (xor[1] << shift);
    t[2] ^= (xor[2] << shift);
    t[3] ^= (xor[3] << shift);
    shift += 12;
    t[0] ^= xor[0];
    t[1] ^= xor[1];
    t[2] ^= xor[2];
    t[3] ^= xor[3];
  } while(shift < end);
  /* Finish the transpose across the four words. */
  _mzd_transpose_Nxjx64(t, 4);
  /* Scatter: emit m destination rows, 16 bits each, four rows per word t[k]
     (same lane layout as the gather, mirrored). */
  i = m;
  word* RESTRICT wk = dst;
  do {
    wk[0] = (uint16_t)t[0];
    if (--i == 0)
      break;
    wk[rowstride_dst] = (uint16_t)t[1];
    if (--i == 0)
      break;
    wk[2 * rowstride_dst] = (uint16_t)t[2];
    if (--i == 0)
      break;
    wk[3 * rowstride_dst] = (uint16_t)t[3];
    if (--i == 0)
      break;
    wk += 4 * rowstride_dst;
    for(int shift = 16;; shift += 16) {
      wk[0] = (uint16_t)(t[0] >> shift);
      if (--i == 0)
	break;
      wk[rowstride_dst] = (uint16_t)(t[1] >> shift);
      if (--i == 0)
	break;
      wk[2 * rowstride_dst] = (uint16_t)(t[2] >> shift);
      if (--i == 0)
	break;
      wk[3 * rowstride_dst] = (uint16_t)(t[3] >> shift);
      if (--i == 0)
	break;
      wk += 4 * rowstride_dst;
    }
  } while(0);
}
996 
997 /**
998  * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 32.
999  *
1000  * \param dst First word of destination matrix.
1001  * \param src First word of source matrix.
1002  * \param rowstride_dst Rowstride of destination matrix.
1003  * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal to 32.
 * \param m Number of columns in source matrix, must be less than or equal to 32.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
1009  *
1010  * \note This function also works to transpose in-place.
1011  */
1012 
static inline void _mzd_copy_transpose_le32xle32(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word const* RESTRICT wks = src;
  word t[16];
  int i = n;
  /* Gather: t[j] holds source row j in its low 32 bits and, when n > 16,
     row j+16 in its high 32 bits; unused slots are zeroed. */
  if (n > 16) {
    i -= 16;
    for (int j = 0; j < 16; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    int j = 0;
    do {
      t[j++] |= (*wks << 32);
      wks += rowstride_src;
    } while(--i);
  } else {
    int j;
    for (j = 0; j < n; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    for (; j < 16; ++j)
      t[j] = 0;
  }
  /* Transpose the bits across the 16 words. */
  _mzd_transpose_Nxjx64(t, 16);
  int one_more = (m & 1);
  word* RESTRICT wk = dst;
  /* Scatter: each destination row is reassembled from two 16-bit fields of
     one t[j] (the field offsets differ for rows < 16 and rows >= 16); rows
     are written in pairs, with a final single row when m is odd. */
  if (m > 16) {
    m -= 16;
    for (int j = 0; j < 16; j += 2) {
      *wk = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j + 1] & 0xFFFF) | ((t[j + 1] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    for (int j = 1; j < m; j += 2) {
      *wk = ((t[j - 1] >> 16) & 0xFFFF) | ((t[j - 1] >> 32) & 0xFFFF0000);
      wk[rowstride_dst] = ((t[j] >> 16) & 0xFFFF) | ((t[j] >> 32) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = ((t[m - 1] >> 16) & 0xFFFF) | ((t[m - 1] >> 32) & 0xFFFF0000);
    }
  } else {
    for (int j = 1; j < m; j += 2) {
      *wk = (t[j - 1] & 0xFFFF) | ((t[j - 1] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = (t[m - 1] & 0xFFFF) | ((t[m - 1] >> 16) & 0xFFFF0000);
    }
  }
}
1067 
/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or
 * equal to 64: copy the rows into a zero-padded 64-word buffer, transpose
 * that in place as a full 64x64 matrix, and copy the first m rows out.
 *
 * \note This function also works to transpose in-place.
 */
static inline void _mzd_copy_transpose_le64xle64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word t[64];
  word const* RESTRICT rp = src;
  int i = 0;
  while (i < n) {
    t[i++] = *rp;
    rp += rowstride_src;
  }
  for (; i < 64; ++i)
    t[i] = 0;
  _mzd_copy_transpose_64x64(t, t, 1, 1);
  word* RESTRICT wp = dst;
  for (int row = 0; row < m; ++row) {
    *wp = t[row];
    wp += rowstride_dst;
  }
}
1087 
1088 void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp);
1089 
/*
 * Core transpose for non-windowed matrices: DST = A^T.  Works in 64x64
 * tiles, peeling off a possibly multi-block layout first, then the strips
 * of full 64-row tiles, then the sub-64 remainders.
 */
mzd_t *_mzd_transpose(mzd_t *DST, mzd_t const *A) {
  assert(!mzd_is_windowed(DST) && !mzd_is_windowed(A));
  // We assume that there fit at least 64 rows in a block, if
  // that is the case then each block will contain a multiple
  // of 64 rows, since blockrows is a power of 2.
  assert(A->blockrows_log >= 6 && DST->blockrows_log >= 6);

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;
  rci_t maxsize = MAX(nrows, ncols);

  word* RESTRICT fwd = mzd_first_row(DST);
  word const* RESTRICT fws = mzd_first_row(A);

  if (maxsize >= 64) {

    // This is the most non-intrusive way to deal with the case of multiple blocks.
    // Note that this code is VERY sensitive. ANY change to _mzd_transpose can easily
    // reduce the speed for small matrices (up to 64x64) by 5 to 10%.
    int const multiple_blocks = (A->flags | DST->flags) & mzd_flag_multiple_blocks;
    if (__M4RI_UNLIKELY(multiple_blocks)) {
      word* RESTRICT non_register_fwd;
      word const* RESTRICT non_register_fws;
      rci_t non_register_nrows;
      rci_t non_register_ncols;
      // The helper transposes everything except the bottom-right corner; on
      // return the non_register_* outputs describe that remaining corner,
      // which the code below finishes.
      _mzd_transpose_multiblock(DST, A, &non_register_fwd, &non_register_fws, &non_register_nrows, &non_register_ncols);
      fwd = non_register_fwd;
      fws = non_register_fws;
      nrows = non_register_nrows;
      ncols = non_register_ncols;
    }

    if (nrows >= 64) {
      /*
       * This is an interesting #if ...
       * I recommend to investigate the number of instructions, and the clocks per instruction,
       * as function of various sizes of the matrix (most likely especially the number of columns
       * (the size of a row) will have influence; also always use multiples of 64 or even 128),
       * for both cases below.
       *
       * To measure this run for example:
       *
       * ./bench_mzd -m 10 -x 10 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 32000 32000
       * ./bench_mzd -m 10 -x 100 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 128 10240
       * etc (increase -x for smaller sizes to get better accuracy).
       *
       * --Carlo Wood
       */
#if 1
      int js = ncols & nrows & 64;	// True if the total number of whole 64x64 matrices is odd.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      word* RESTRICT fwd_current = fwd;
      word const* RESTRICT fws_current = fws;
      if (js) {
	js = 1;
	_mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
	if ((nrows | ncols) == 64) {	// Both dimensions are exactly 64: done.
	  __M4RI_DD_MZD(DST);
	  return DST;
	}
	fwd_current += rowstride_64_dst;
	++fws_current;
      }
      rci_t const whole_64cols = ncols / 64;
      // The use of delayed and even, is to avoid calling _mzd_copy_transpose_64x64_2 twice.
      // This way it can be inlined without duplicating the amount of code that has to be loaded.
      word* RESTRICT fwd_delayed = NULL;
      word const* RESTRICT fws_delayed = NULL;
      int even = 0;
      // Walk the matrix in horizontal strips of 64 rows: transpose the whole
      // 64x64 tiles pairwise, then the partial tile of ncols % 64 columns,
      // then advance to the next strip until fewer than 64 rows remain.
      while (1)
      {
	for (int j = js; j < whole_64cols; ++j) {
	  if (!even) {
	    fwd_delayed = fwd_current;
	    fws_delayed = fws_current;
	  } else {
	    _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
	  }
	  fwd_current += rowstride_64_dst;
	  ++fws_current;
	  even = !even;
	}
	nrows -= 64;
	if (ncols % 64) {
	  _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
	}
	fwd += 1;
	fws += 64 * A->rowstride;
	if (nrows < 64)
	  break;
	js = 0;
	fws_current = fws;
	fwd_current = fwd;
      }
#else
      // The same as the above, but without using _mzd_copy_transpose_64x64_2.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      rci_t const whole_64cols = ncols / 64;
      assert(nrows >= 64);
      do {
	for (int j = 0; j < whole_64cols; ++j) {
	  _mzd_copy_transpose_64x64(fwd + j * rowstride_64_dst, fws + j, DST->rowstride, A->rowstride);
	}
	nrows -= 64;
	if (ncols % 64) {
	  _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
	}
	fwd += 1;
	fws += 64 * A->rowstride;
      } while(nrows >= 64);
#endif
    }

    if (nrows == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    // Transpose the remaining top rows. Now 0 < nrows < 64.

    while (ncols >= 64)
    {
      _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrows);
      ncols -= 64;
      fwd += 64 * DST->rowstride;
      fws += 1;
    }

    if (ncols == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    maxsize = MAX(nrows, ncols);
  }

  // Transpose the remaining corner. Now both 0 < nrows < 64 and 0 < ncols < 64.

  if (maxsize <= 8) {
    _mzd_copy_transpose_le8xle8(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 16) {
    _mzd_copy_transpose_le16xle16(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 32) {
    _mzd_copy_transpose_le32xle32(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }
  else {
    _mzd_copy_transpose_le64xle64(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }

  __M4RI_DD_MZD(DST);
  return DST;
}
1244 
/*
 * Transpose helper for matrices whose storage spans multiple memory blocks.
 * Handles every block-aligned region except the bottom-right corner R, whose
 * position and first words are reported back through the out-parameters so
 * that _mzd_transpose can finish it.
 */
void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp) {

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;

  rci_t blockrows_dst = 1 << DST->blockrows_log;	// The maximum number of rows in a block of DST.
  rci_t blockrows_src = 1 << A->blockrows_log;		// The maximum number of rows in a block of A.

  /* We're dividing the source matrix into blocks of multiples of 64x64, such that each
   * block fits entirely inside a single memory allocation block, both in the source
   * as well as the corresponding destination.
   *
   *                      <-------------------ncols----------------->
   *                                  <---------blockrows_dst------->
   * .---------------------------------------------------------------.
   * |P      ^  Matrix A:|   .       |Q      .       .       .       |<-^---- A->blocks[0].begin
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|  |
   * |       |           |   .       |       .   ^   .       .       |  |
   * |       |           |   .       |       .<64x64>.       .       |  |
   * |       |           |   .       |       .   v   .       .       |  |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|  |- blockrows_src
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |nrows      |- - - - - -|- - - - - - - - - - - - - - - -|  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  |
   * |       |           |   .       |       .       .       .       |  v
   * |===================+===========================================|
   * |R      |           |   .       |S      .       .       .       |<------ A->blocks[1].begin
   * |       |           |   .       |       .       .       .       |
   * |       |           |   .       |       .       .       .       |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|
   * |       |           |   .       |       .       .       .       |
   * |       |           |   .       |       .       .       .       |
   * |       |           |   .       |       .       .       .       |
   * |       |           |- - - - - -|- - - - - - - - - - - - - - - -|
   * |       v           |   .       |       .       .       .       |
   * |                   `-------------------------------------------|
   * |                               |                               |
   * |                               |                               |
   * |                               |                               |
   * |                               |                               |
   * |                               |                               |
   * `---------------------------------------------------------------'
   *
   * Imagine this also to be the memory map of DST, which then would be
   * mirrored in the diagonal line from the top/right to the bottom/left.
   * Then each of the squares P, Q, R and S lay entirely inside one
   * memory block in both the source as well as the destination matrix.
   * P and Q are really the same block for matrix A (as are R and S),
   * while P and R (and Q and S) are really the same block for DST.
   *
   * We're going to run over the top/right corners of each of these
   * memory "blocks" and then transpose it, one by one, running
   * from right to left and top to bottom. The last one (R) will be
   * done by the calling function, so we just return when we get there.
   */

  // Top row of R (rows of A rounded down to a block multiple) and left
  // column of R (cols of A rounded down to a DST-block multiple).
  rci_t R_top = (nrows >> A->blockrows_log) << A->blockrows_log;
  rci_t R_right = (ncols >> DST->blockrows_log) << DST->blockrows_log;
  for (rci_t col = 0; col < ncols; col += blockrows_dst) {
    rci_t end = (col == R_right) ? R_top : nrows;	// Skip R itself; the caller finishes it.
    for (rci_t row = 0; row < end; row += blockrows_src) {

      rci_t nrowsb = (row < R_top) ? blockrows_src : (nrows - R_top);
      rci_t ncolsb = (col < R_right) ? blockrows_dst : (ncols - R_right);
      word const* RESTRICT fws = mzd_row(A, row) + col / m4ri_radix;
      word* RESTRICT fwd = mzd_row(DST, col) + row / m4ri_radix;

      // The following code is (almost) duplicated from _mzd_transpose.
      if (nrowsb >= 64) {

	int js = ncolsb & nrowsb & 64;	// True if the total number of whole 64x64 matrices is odd.
	wi_t const rowstride_64_dst = 64 * DST->rowstride;
	word* RESTRICT fwd_current = fwd;
	word const* RESTRICT fws_current = fws;
	if (js) {
	  js = 1;
	  _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
	  fwd_current += rowstride_64_dst;
	  ++fws_current;
	}
	rci_t const whole_64cols = ncolsb / 64;
	// The use of delayed and even, is to avoid calling _mzd_copy_transpose_64x64_2 twice.
	// This way it can be inlined without duplicating the amount of code that has to be loaded.
	word* RESTRICT fwd_delayed = NULL;
	word const* RESTRICT fws_delayed = NULL;
	int even = 0;
	while (1)
	{
	  for (int j = js; j < whole_64cols; ++j) {
	    if (!even) {
	      fwd_delayed = fwd_current;
	      fws_delayed = fws_current;
	    } else {
	      _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
	    }
	    fwd_current += rowstride_64_dst;
	    ++fws_current;
	    even = !even;
	  }
	  nrowsb -= 64;
	  if (ncolsb % 64) {
	    _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncolsb % 64);
	  }
	  fwd += 1;
	  fws += 64 * A->rowstride;
	  if (nrowsb < 64)
	    break;
	  js = 0;
	  fws_current = fws;
	  fwd_current = fwd;
	}
      }

      if (nrowsb == 0)
	continue;

      // Transpose the remaining top rows. Now 0 < nrowsb < 64.

      while (ncolsb >= 64)
      {
	_mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrowsb);
	ncolsb -= 64;
	fwd += 64 * DST->rowstride;
	fws += 1;
      }

      // This is true because if it wasn't then nrowsb has to be 0 and we continued before already.
      assert(ncolsb == 0);
    }
  }

  // Report the remaining corner R (size and first words) back to the caller.
  *nrowsp = nrows - R_top;
  *ncolsp = ncols - R_right;
  if (R_top < nrows)
    *fwsp = mzd_row(A, R_top) + R_right / m4ri_radix;
  if (R_right < ncols)
    *fwdp = mzd_row(DST, R_right) + R_top / m4ri_radix;
}
1388 
mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A) {
  /* Allocate the result if needed, otherwise validate its dimensions. */
  if (DST == NULL)
    DST = mzd_init( A->ncols, A->nrows );
  else if (__M4RI_UNLIKELY(DST->nrows != A->ncols || DST->ncols != A->nrows))
    m4ri_die("mzd_transpose: Wrong size for return matrix.\n");

  /* A degenerate (0 x n or n x 0) matrix transposes to a plain copy. */
  if (A->nrows == 0 || A->ncols == 0)
    return mzd_copy(DST, A);

  /* Fast path: neither operand is a window. */
  if (__M4RI_LIKELY(!mzd_is_windowed(DST) && !mzd_is_windowed(A)))
    return _mzd_transpose(DST, A);

  /* _mzd_transpose requires non-windowed operands, so materialize full
     copies for whichever side is a window. */
  int const source_is_window = mzd_is_windowed(A);
  if (source_is_window)
    A = mzd_copy(NULL, A);

  if (mzd_is_windowed(DST)) {
    mzd_t *tmp = mzd_init(DST->nrows, DST->ncols);
    _mzd_transpose(tmp, A);
    mzd_copy(DST, tmp);
    mzd_free(tmp);
  } else {
    _mzd_transpose(DST, A);
  }

  if (source_is_window)
    mzd_free((mzd_t*)A);
  return DST;
}
1419 
/**
 * \brief Naive cubic matrix multiplication: C = A * B.
 *
 * \param C Preallocated product matrix (A->nrows x B->ncols), or NULL to allocate it.
 * \param A Left operand.
 * \param B Right operand.
 *
 * \return Pointer to C.
 */
mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  /* Without this check, mismatched inner dimensions make the kernels below
   * read past the end of rows of A or B. */
  if (A->ncols != B->nrows) {
    m4ri_die("mzd_mul_naive: A ncols (%d) need to match B nrows (%d).\n", A->ncols, B->nrows);
  }
  if (C == NULL) {
    C = mzd_init(A->nrows, B->ncols);
  } else {
    if (C->nrows != A->nrows || C->ncols != B->ncols) {
      m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n");
    }
  }
  if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */
    /* For narrow B, multiply against an explicit transpose instead. */
    mzd_t *BT = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, BT, 1);
    mzd_free (BT);
  } else {
    _mzd_mul_va(C, A, B, 1);
  }
  return C;
}
1437 
/**
 * \brief Naive cubic multiply-accumulate: C = C + A * B.
 *
 * \param C Accumulator matrix (A->nrows x B->ncols), must be preallocated.
 * \param A Left operand.
 * \param B Right operand.
 *
 * \return Pointer to C.
 */
mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  /* Without this check, mismatched inner dimensions make the kernels below
   * read past the end of rows of A or B. */
  if (A->ncols != B->nrows) {
    m4ri_die("mzd_addmul_naive: A ncols (%d) need to match B nrows (%d).\n", A->ncols, B->nrows);
  }
  if (C->nrows != A->nrows || C->ncols != B->ncols) {
    m4ri_die("mzd_addmul_naive: Provided return matrix has wrong dimensions.\n");
  }

  if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */
    /* For narrow B, multiply against an explicit transpose instead. */
    mzd_t *BT = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, BT, 0);
    mzd_free (BT);
  } else {
    _mzd_mul_va(C, A, B, 0);
  }
  return C;
}
1452 
/*
 * Cubic multiplication kernel.  B is expected to be the TRANSPOSE of the
 * right-hand operand (callers pass BT), so each output bit is the parity of
 * the AND of a row of A with a row of B.  64 such dot products are collected
 * in parity[] and packed into one output word by m4ri_parity64.  If clear is
 * set, C is zeroed first; otherwise the product is accumulated into C.
 */
mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, const int clear) {
  wi_t eol;
  word *a, *b, *c;

  if (clear) {
    word const mask_end = C->high_bitmask;
    /* improves performance on x86_64 but is not cross platform */
    /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
    for (rci_t i = 0; i < C->nrows; ++i) {
      wi_t j = 0;
      for (; j < C->width - 1; ++j) {
  	C->rows[i][j] = 0;
      }
      /* Only clear the bits that belong to the matrix in the last word. */
      C->rows[i][j] &= ~mask_end;
    }
  }

  /* eol = number of full output words per row; a partial last word (when
     ncols is not a multiple of m4ri_radix) is handled separately. */
  if(C->ncols % m4ri_radix) {
    eol = (C->width - 1);
  } else {
    eol = (C->width);
  }

  word parity[64];
  for (int i = 0; i < 64; ++i) {
    parity[i] = 0;
  }
  wi_t const wide = A->width;
  /* Process rows of C in blocks of __M4RI_MUL_BLOCKSIZE (cache blocking);
     the remainder rows are handled by the tail loop below. */
  int const blocksize = __M4RI_MUL_BLOCKSIZE;
  for (rci_t start = 0; start + blocksize <= C->nrows; start += blocksize) {
    for (rci_t i = start; i < start + blocksize; ++i) {
      a = A->rows[i];
      c = C->rows[i];
      for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
	for (int k = 0; k < m4ri_radix; ++k) {
          b = B->rows[j + k];
          parity[k] = a[0] & b[0];
          for (wi_t ii = wide - 1; ii >= 1; --ii)
	    parity[k] ^= a[ii] & b[ii];
        }
        c[j / m4ri_radix] ^= m4ri_parity64(parity);
      }

      if (eol != C->width) {
	word const mask_end = C->high_bitmask;
        /* improves performance on x86_64 but is not cross platform */
	/* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
        for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
          b = B->rows[m4ri_radix * eol + k];
          parity[k] = a[0] & b[0];
          for (wi_t ii = 1; ii < A->width; ++ii)
            parity[k] ^= a[ii] & b[ii];
        }
        /* Stale parity[] entries beyond ncols % m4ri_radix are masked off. */
        c[eol] ^= m4ri_parity64(parity) & mask_end;
      }
    }
  }

  /* Tail: rows left over after the blocked loop above. */
  for (rci_t i = C->nrows - (C->nrows % blocksize); i < C->nrows; ++i) {
    a = A->rows[i];
    c = C->rows[i];
    for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
      for (int k = 0; k < m4ri_radix; ++k) {
        b = B->rows[j+k];
        parity[k] = a[0] & b[0];
        for (wi_t ii = wide - 1; ii >= 1; --ii)
          parity[k] ^= a[ii] & b[ii];
      }
      c[j/m4ri_radix] ^= m4ri_parity64(parity);
    }

    if (eol != C->width) {
      word const mask_end = C->high_bitmask;
      /* improves performance on x86_64 but is not cross platform */
      /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
      for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
        b = B->rows[m4ri_radix * eol + k];
        parity[k] = a[0] & b[0];
        for (wi_t ii = 1; ii < A->width; ++ii)
          parity[k] ^= a[ii] & b[ii];
      }
      c[eol] ^= m4ri_parity64(parity) & mask_end;
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
1541 
/*
 * Schoolbook product C (+)= v * A: for every set bit (r, c) of v, add row c
 * of A into row r of C.  With clear set, C is zeroed first.
 */
mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear) {
  if (clear)
    mzd_set_ui(C, 0);

  rci_t const nr = v->nrows;
  rci_t const nc = v->ncols;

  for (rci_t r = 0; r < nr; ++r) {
    for (rci_t c = 0; c < nc; ++c) {
      if (!mzd_read_bit(v, r, c))
        continue;
      mzd_combine(C, r, 0, C, r, 0, A, c, 0);
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
1557 
/* Fill A with random bits; bits outside the matrix in the last word of each
 * row are left untouched. */
void mzd_randomize(mzd_t *A) {
  wi_t const last = A->width - 1;
  word const mask_end = A->high_bitmask;
  for (rci_t r = 0; r < A->nrows; ++r) {
    word *row = A->rows[r];
    for (wi_t j = 0; j < last; ++j)
      row[j] = m4ri_random_word();
    /* Merge random bits into the valid part of the final (partial) word. */
    row[last] ^= (row[last] ^ m4ri_random_word()) & mask_end;
  }

  __M4RI_DD_MZD(A);
}
1569 
/* Like mzd_randomize, but draws words from a user-supplied callback. */
void mzd_randomize_custom(mzd_t *A, m4ri_random_callback rc, void* data) {
  wi_t const last = A->width - 1;
  word const mask_end = A->high_bitmask;
  for (rci_t r = 0; r < A->nrows; ++r) {
    word *row = A->rows[r];
    for (wi_t j = 0; j < last; ++j)
      row[j] = rc(data);
    /* Merge random bits into the valid part of the final (partial) word. */
    row[last] ^= (row[last] ^ rc(data)) & mask_end;
  }

  __M4RI_DD_MZD(A);
}
1581 
mzd_set_ui(mzd_t * A,unsigned int value)1582 void mzd_set_ui( mzd_t *A, unsigned int value) {
1583   word const mask_end = A->high_bitmask;
1584 
1585   for (rci_t i = 0; i < A->nrows; ++i) {
1586     word *row = A->rows[i];
1587     for(wi_t j = 0; j < A->width - 1; ++j)
1588       row[j] = 0;
1589     row[A->width - 1] &= ~mask_end;
1590   }
1591 
1592   if(value % 2 == 0) {
1593     __M4RI_DD_MZD(A);
1594     return;
1595   }
1596 
1597   rci_t const stop = MIN(A->nrows, A->ncols);
1598   for (rci_t i = 0; i < stop; ++i) {
1599     mzd_write_bit(A, i, i, 1);
1600   }
1601 
1602   __M4RI_DD_MZD(A);
1603 }
1604 
/* Return TRUE iff A and B have identical dimensions and identical entries;
 * bits beyond ncols in the last word of each row are ignored. */
int mzd_equal(mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows || A->ncols != B->ncols)
    return FALSE;
  if (A == B)
    return TRUE;

  wi_t const last = A->width - 1;
  word const mask_end = A->high_bitmask;

  for (rci_t i = 0; i < A->nrows; ++i) {
    word const *ra = A->rows[i];
    word const *rb = B->rows[i];
    for (wi_t j = 0; j < last; ++j) {
      if (ra[j] != rb[j])
        return FALSE;
    }
    if ((ra[last] ^ rb[last]) & mask_end)
      return FALSE;
  }
  return TRUE;
}
1626 
/* Three-way comparison of A and B: order first by nrows, then ncols, then
 * entries.  Within a row, words with higher index (higher column indices)
 * are more significant; earlier rows dominate later rows. */
int mzd_cmp(mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows)
    return (A->nrows < B->nrows) ? -1 : 1;
  if (A->ncols != B->ncols)
    return (A->ncols < B->ncols) ? -1 : 1;

  word const mask_end = A->high_bitmask;
  wi_t const last = A->width - 1;

  for (rci_t i = 0; i < A->nrows; ++i) {
    /* The last word may contain junk beyond ncols: compare it masked. */
    word const a_hi = A->rows[i][last] & mask_end;
    word const b_hi = B->rows[i][last] & mask_end;
    if (a_hi != b_hi)
      return (a_hi < b_hi) ? -1 : 1;

    for (wi_t j = last - 1; j >= 0; --j) {
      word const a_w = A->rows[i][j];
      word const b_w = B->rows[i][j];
      if (a_w != b_w)
        return (a_w < b_w) ? -1 : 1;
    }
  }
  return 0;
}
1654 
/* Copy P into N (allocated when NULL; may be larger than P).  Bits of N
 * beyond P's columns in the shared last word are preserved. */
mzd_t *mzd_copy(mzd_t *N, mzd_t const *P) {
  if (N == P)
    return N;

  if (N == NULL)
    N = mzd_init(P->nrows, P->ncols);
  else if (N->nrows < P->nrows || N->ncols < P->ncols)
    m4ri_die("mzd_copy: Target matrix is too small.");

  wi_t const last = P->width - 1;
  word const mask_end = P->high_bitmask;
  for (rci_t i = 0; i < P->nrows; ++i) {
    word const *src = P->rows[i];
    word *dst = N->rows[i];
    for (wi_t j = 0; j < last; ++j)
      dst[j] = src[j];
    /* Merge the partial last word, keeping N's bits beyond P's columns. */
    dst[last] = (dst[last] & ~mask_end) | (src[last] & mask_end);
  }

  __M4RI_DD_MZD(N);
  return N;
}
1678 
1679 /* This is sometimes called augment */
mzd_concat(mzd_t * C,mzd_t const * A,mzd_t const * B)1680 mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B) {
1681   if (A->nrows != B->nrows) {
1682     m4ri_die("mzd_concat: Bad arguments to concat!\n");
1683   }
1684 
1685   if (C == NULL) {
1686     C = mzd_init(A->nrows, A->ncols + B->ncols);
1687   } else if (C->nrows != A->nrows || C->ncols != (A->ncols + B->ncols)) {
1688     m4ri_die("mzd_concat: C has wrong dimension!\n");
1689   }
1690 
1691   for (rci_t i = 0; i < A->nrows; ++i) {
1692     word *dst_truerow = C->rows[i];
1693     word *src_truerow = A->rows[i];
1694     for (wi_t j = 0; j < A->width; ++j) {
1695       dst_truerow[j] = src_truerow[j];
1696     }
1697   }
1698 
1699   for (rci_t i = 0; i < B->nrows; ++i) {
1700     for (rci_t j = 0; j < B->ncols; ++j) {
1701       mzd_write_bit(C, i, j + A->ncols, mzd_read_bit(B, i, j));
1702     }
1703   }
1704 
1705   __M4RI_DD_MZD(C);
1706   return C;
1707 }
1708 
/* Stack A on top of B into C; both operands are word-aligned in C so rows
 * are copied word-wise. */
mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->ncols != B->ncols)
    m4ri_die("mzd_stack: A->ncols (%d) != B->ncols (%d)!\n", A->ncols, B->ncols);

  if (C == NULL)
    C = mzd_init(A->nrows + B->nrows, A->ncols);
  else if (C->nrows != (A->nrows + B->nrows) || C->ncols != A->ncols)
    m4ri_die("mzd_stack: C has wrong dimension!\n");

  /* Rows of A fill the top of C. */
  for (rci_t i = 0; i < A->nrows; ++i) {
    word const *src = A->rows[i];
    word *dst = C->rows[i];
    for (wi_t j = 0; j < A->width; ++j)
      dst[j] = src[j];
  }

  /* Rows of B fill the bottom of C. */
  for (rci_t i = 0; i < B->nrows; ++i) {
    word const *src = B->rows[i];
    word *dst = C->rows[A->nrows + i];
    for (wi_t j = 0; j < B->width; ++j)
      dst[j] = src[j];
  }

  __M4RI_DD_MZD(C);
  return C;
}
1739 
/**
 * \brief Invert A via Gaussian elimination on [A | I].
 *
 * \param INV Preallocated result matrix, or NULL to allocate it.
 * \param A   Matrix to invert.
 * \param I   Identity matrix of matching dimensions.
 *
 * \return The inverse of A, or NULL if A is not invertible.
 */
mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I) {
  mzd_t *H;

  /* Echelonize [A | I]; if A reduces with full rank, the right half is A^-1. */
  H = mzd_concat(NULL, A, I);

  rci_t x = mzd_echelonize_naive(H, TRUE);

  /* A is invertible iff it has full rank.  The previous test (x == 0) only
   * rejected the zero matrix and returned a bogus "inverse" for any other
   * singular input. */
  if (x < A->nrows) {
    mzd_free(H);
    return NULL;
  }

  INV = mzd_submatrix(INV, H, 0, A->ncols, A->nrows, 2 * A->ncols);

  mzd_free(H);

  __M4RI_DD_MZD(INV);
  return INV;
}
1759 
/**
 * \brief ret = left + right over GF(2) (element-wise XOR).
 *
 * Allocates a fresh matrix when ret is NULL.  When ret is supplied and
 * does not alias left, its dimensions must match left's.
 */
mzd_t *mzd_add(mzd_t *ret, mzd_t const *left, mzd_t const *right) {
  if (left->nrows != right->nrows || left->ncols != right->ncols)
    m4ri_die("mzd_add: rows and columns must match.\n");

  if (ret == NULL) {
    ret = mzd_init(left->nrows, left->ncols);
  } else if (ret != left &&
             (ret->nrows != left->nrows || ret->ncols != left->ncols)) {
    m4ri_die("mzd_add: rows and columns of returned matrix must match.\n");
  }

  return _mzd_add(ret, left, right);
}
1773 
/**
 * \brief C = A + B over GF(2) (word-wise XOR), no dimension checking.
 *
 * Processes MIN(A->nrows, B->nrows, C->nrows) rows.  The switch unrolls
 * the per-row word loop for widths 1..8; wider matrices are delegated
 * to mzd_combine_even().  The last word of each row is updated with
 *     C ^= (A ^ B ^ C) & mask_end
 * which writes A^B into the valid bits while leaving bits of C past
 * C->ncols untouched.  Note this also works when C aliases A (then the
 * expression reduces to C ^= B & mask_end).
 */
mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  rci_t const nrows = MIN(MIN(A->nrows, B->nrows), C->nrows);

  /* if C aliases B, exchange A and B so the bodies below may read A
   * after C has been written */
  if (C == B) { //swap
    mzd_t const *tmp = A;
    A = B;
    B = tmp;
  }

  word const mask_end = C->high_bitmask;

  switch(A->width) {
  case 0:
    /* zero-width matrix: nothing to do */
    return C;
  case 1:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] ^= ((A->rows[i][0] ^ B->rows[i][0] ^ C->rows[i][0]) & mask_end);
    }
    break;
  case 2:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] ^= ((A->rows[i][1] ^ B->rows[i][1] ^ C->rows[i][1]) & mask_end);
    }
    break;
  case 3:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] ^= ((A->rows[i][2] ^ B->rows[i][2] ^ C->rows[i][2]) & mask_end);
    }
    break;
  case 4:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] ^= ((A->rows[i][3] ^ B->rows[i][3] ^ C->rows[i][3]) & mask_end);
    }
    break;
  case 5:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] ^= ((A->rows[i][4] ^ B->rows[i][4] ^ C->rows[i][4]) & mask_end);
    }
    break;
  case 6:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] ^= ((A->rows[i][5] ^ B->rows[i][5] ^ C->rows[i][5]) & mask_end);
    }
    break;
  case 7:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] ^= ((A->rows[i][6] ^ B->rows[i][6] ^ C->rows[i][6]) & mask_end);
    }
    break;
  case 8:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] = A->rows[i][6] ^ B->rows[i][6];
      C->rows[i][7] ^= ((A->rows[i][7] ^ B->rows[i][7] ^ C->rows[i][7]) & mask_end);
    }
    break;

  default:
    /* generic path for wide matrices */
    for(rci_t i = 0; i < nrows; ++i) {
      mzd_combine_even(C,i,0, A,i,0, B,i,0);
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
1866 
/**
 * \brief Copy M[startrow..endrow-1, startcol..endcol-1] into S.
 *
 * If S is NULL a new (endrow-startrow) x (endcol-startcol) matrix is
 * allocated; otherwise S must be at least that large.  A fast memcpy
 * path is taken when startcol is word-aligned; the general path
 * reassembles each destination word with mzd_read_bits().
 */
mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const startrow, rci_t const startcol, rci_t const endrow, rci_t const endcol) {
  rci_t const nrows = endrow - startrow;
  rci_t const ncols = endcol - startcol;

  if (S == NULL) {
    S = mzd_init(nrows, ncols);
  } else if( (S->nrows < nrows) | (S->ncols < ncols) ) {
    m4ri_die("mzd_submatrix: got S with dimension %d x %d but expected %d x %d\n", S->nrows, S->ncols, nrows, ncols);
  }

  if (startcol % m4ri_radix == 0) {

    wi_t const startword = startcol / m4ri_radix;
    /* we start at the beginning of a word */
    if(ncols / m4ri_radix != 0) {
      /* bulk-copy all complete words of each row */
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        memcpy(S->rows[i], M->rows[x] + startword, sizeof(word) * (ncols / m4ri_radix));
      }
    }
    if (ncols % m4ri_radix) {
      word const mask_end = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        /* process remaining bits */
        word temp = M->rows[x][startword + ncols / m4ri_radix] & mask_end;
        S->rows[i][ncols / m4ri_radix] = temp;
      }
    }
  } else {
    /* unaligned start column: build each destination word from up to
     * two source words via mzd_read_bits */
    wi_t j;
    for(rci_t i=0; i<nrows; i++) {
      for(j=0; j+m4ri_radix<ncols; j+=m4ri_radix)
        S->rows[i][j/m4ri_radix] = mzd_read_bits(M, startrow+i, startcol+j, m4ri_radix);
      /* last (possibly partial) word: keep S's padding bits, replace
       * only the valid low bits */
      S->rows[i][j/m4ri_radix] &= ~S->high_bitmask;
      S->rows[i][j/m4ri_radix] |= mzd_read_bits(M, startrow+i, startcol+j, ncols - j) & S->high_bitmask;
    }
  }
  __M4RI_DD_MZD(S);
  return S;
}
1906 
/**
 * \brief Swap columns cola and colb of M in place.
 *
 * Works block by block over M's row blocks.  Per row the two bits are
 * exchanged branch-free with an XOR trick: xor holds (bit_hi ^ bit_lo)
 * at the lower bit position; XORing (xor | xor << offset) back flips
 * both bits exactly when they differ.
 */
void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb) {
  if (cola == colb)
    return;

  rci_t const _cola = cola;
  rci_t const _colb = colb;

  /* word index and bit position of each column */
  wi_t const a_word = _cola / m4ri_radix;
  wi_t const b_word = _colb / m4ri_radix;

  int const a_bit = _cola % m4ri_radix;
  int const b_bit = _colb % m4ri_radix;

  word* RESTRICT ptr = mzd_first_row(M);
  int max_bit = MAX(a_bit, b_bit);
  int count = mzd_rows_in_block(M, 0);
  assert(count > 0);
  int min_bit = a_bit + b_bit - max_bit;  /* == MIN(a_bit, b_bit) */
  int block = 0;
  int offset = max_bit - min_bit;         /* bit distance within a word */
  word mask = m4ri_one << min_bit;        /* mask at the lower position */

  if (a_word == b_word) {
    /* both columns live in the same word of every row: 4-way unrolled */
    while(1) {
      ptr += a_word;
      int fast_count = count / 4;
      int rest_count = count - 4 * fast_count;
      word xor[4];
      wi_t const rowstride = M->rowstride;
      while (fast_count--) {
	xor[0] = ptr[0];
	xor[1] = ptr[rowstride];
	xor[2] = ptr[2 * rowstride];
	xor[3] = ptr[3 * rowstride];
	xor[0] ^= xor[0] >> offset;
	xor[1] ^= xor[1] >> offset;
	xor[2] ^= xor[2] >> offset;
	xor[3] ^= xor[3] >> offset;
	xor[0] &= mask;
	xor[1] &= mask;
	xor[2] &= mask;
	xor[3] &= mask;
	xor[0] |= xor[0] << offset;
	xor[1] |= xor[1] << offset;
	xor[2] |= xor[2] << offset;
	xor[3] |= xor[3] << offset;
	ptr[0] ^= xor[0];
	ptr[rowstride] ^= xor[1];
	ptr[2 * rowstride] ^= xor[2];
	ptr[3 * rowstride] ^= xor[3];
	ptr += 4 * rowstride;
      }
      /* leftover rows of this block */
      while (rest_count--) {
	word xor = *ptr;
	xor ^= xor >> offset;
	xor &= mask;
	*ptr ^= xor | (xor << offset);
	ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
	break;
      ptr = mzd_first_row_next_block(M, block);
    }
  } else {
    /* columns live in different words: min_ptr points at the word with
     * the lower bit, the other word is max_offset words further right */
    word* RESTRICT min_ptr;
    wi_t max_offset;
    if (min_bit == a_bit) {
      min_ptr = ptr + a_word;
      max_offset = b_word - a_word;
    } else {
      min_ptr = ptr + b_word;
      max_offset = a_word - b_word;
    }
    while(1) {
      wi_t const rowstride = M->rowstride;
      while(count--) {
	word xor = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
	min_ptr[0] ^= xor;
	min_ptr[max_offset] ^= xor << offset;
	min_ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
	break;
      ptr = mzd_first_row_next_block(M, block);
      if (min_bit == a_bit)
	min_ptr = ptr + a_word;
      else
	min_ptr = ptr + b_word;
    }
  }

  __M4RI_DD_MZD(M);
}
2000 
/**
 * \brief Return 1 if A is the zero matrix, 0 otherwise.
 *
 * Padding bits past A->ncols in the last word of each row are masked
 * out and therefore ignored.
 */
int mzd_is_zero(mzd_t const *A) {
  word const mask_end = A->high_bitmask;

  for (rci_t r = 0; r < A->nrows; ++r) {
    word const *row = A->rows[r];
    /* fold the whole row into one accumulator, masking the last word */
    word acc = row[A->width - 1] & mask_end;
    for (wi_t w = 0; w < A->width - 1; ++w)
      acc |= row[w];
    if (acc)
      return 0;
  }
  return 1;
}
2013 
/**
 * \brief Copy row j of A into row i of B.
 *
 * B must have at least as many columns as A.  Bits of row i of B
 * beyond A->ncols are left untouched.
 */
void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j) {
  assert(B->ncols >= A->ncols);
  wi_t const last = MIN(B->width, A->width) - 1;

  word const *src = A->rows[j];
  word *dst = B->rows[i];

  /* mask selecting the valid bits of A's final word */
  word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);

  if (last == 0) {
    /* single-word row: splice the valid bits into dst */
    dst[0] = (src[0] & mask_end) | (dst[0] & ~mask_end);
  } else {
    /* copy all complete words, then splice the final word */
    memcpy(dst, src, last * sizeof(word));
    dst[last] = (dst[last] & ~mask_end) | (src[last] & mask_end);
  }

  __M4RI_DD_ROW(B, i);
}
2034 
2035 
/**
 * \brief Find the first non-zero entry at or after (start_row, start_col).
 *
 * The column range is scanned word by word: first the (possibly
 * partial) word containing start_col, then all complete words, then
 * the final partial word.  Within each word, rows are compared with
 * m4ri_lesser_LSB, which presumably selects the candidate whose least
 * significant set bit sits in the lower position (defined in misc.h --
 * TODO confirm).  On success the pivot coordinates are stored in *r
 * and *c and 1 is returned; 0 means no non-zero entry exists.
 */
int mzd_find_pivot(mzd_t const *A, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c) {
  rci_t const nrows = A->nrows;
  rci_t const ncols = A->ncols;
  word data = 0;
  rci_t row_candidate = 0;
  if(A->ncols - start_col < m4ri_radix) {
    /* the remaining columns fit into a single word */
    for(rci_t j = start_col; j < A->ncols; j += m4ri_radix) {
      int const length = MIN(m4ri_radix, ncols - j);
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = mzd_read_bits(A, i, j, length);
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
        }
      }
      if(data) {
        *r = row_candidate;
        /* locate the lowest set bit -> pivot column */
        for(int l = 0; l < length; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = j + l;
            break;
          }
        }
	__M4RI_DD_RCI(*r);
	__M4RI_DD_RCI(*c);
	__M4RI_DD_INT(1);
        return 1;
      }
    }
  } else {
    /* we definitely have more than one word */
    /* handle first word */
    int const bit_offset = (start_col % m4ri_radix);
    wi_t const word_offset = start_col / m4ri_radix;
    /* mask_begin keeps only the bits at or above start_col's position */
    word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-bit_offset);
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][word_offset] & mask_begin;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        /* a set bit at start_col itself cannot be beaten: stop early */
        if(__M4RI_GET_BIT(data,bit_offset)) {
          break;
        }
      }
    }
    if(data) {
      *r = row_candidate;
      data >>= bit_offset;
      assert(data);
      for(int l = 0; l < (m4ri_radix - bit_offset); ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = start_col + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
    /* handle complete words */
    for(wi_t wi = word_offset + 1; wi < A->width - 1; ++wi) {
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = A->rows[i][wi];
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
          /* bit 0 set is unbeatable within this word: stop early */
          if(__M4RI_GET_BIT(data, 0))
            break;
        }
      }
      if(data) {
        *r = row_candidate;
        for(int l = 0; l < m4ri_radix; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = wi * m4ri_radix + l;
            break;
          }
        }
	__M4RI_DD_RCI(*r);
	__M4RI_DD_RCI(*c);
	__M4RI_DD_INT(1);
        return 1;
      }
    }
    /* handle last word */
    int const end_offset = (A->ncols % m4ri_radix) ? (A->ncols % m4ri_radix) : m4ri_radix;
    /* end_offset % m4ri_radix maps a full word (end_offset == radix)
     * back to 0, for which __M4RI_LEFT_BITMASK yields the all-ones mask */
    word const mask_end = __M4RI_LEFT_BITMASK(end_offset % m4ri_radix);
    wi_t wi = A->width - 1;
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][wi] & mask_end;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        if(__M4RI_GET_BIT(data,0))
          break;
      }
    }
    if(data) {
      *r = row_candidate;
      for(int l = 0; l < end_offset; ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = wi * m4ri_radix + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
  }
  /* no pivot found */
  __M4RI_DD_RCI(*r);
  __M4RI_DD_RCI(*c);
  __M4RI_DD_INT(0);
  return 0;
}
2153 
2154 
/* Helper macros for the SWAR popcount below: MASK(c) produces the
 * alternating bit mask with runs of 2^c bits (0x5555..., 0x3333..., ...)
 * and COUNT(x,c) sums adjacent fields of width 2^c in parallel. */
#define MASK(c)    (((uint64_t)(-1)) / (__M4RI_TWOPOW(__M4RI_TWOPOW(c)) + 1))
#define COUNT(x,c) ((x) & MASK(c)) + (((x) >> (__M4RI_TWOPOW(c))) & MASK(c))

/**
 * \brief Return the number of set bits in w.
 *
 * Classic divide-and-conquer (SWAR) population count: each step merges
 * neighbouring fields of twice the width until one 64-bit sum remains.
 */
static inline int m4ri_bitcount(word w)  {
   uint64_t n = __M4RI_CONVERT_TO_UINT64_T(w);
   n = COUNT(n, 0);  /* pairs of bits */
   n = COUNT(n, 1);  /* nibbles */
   n = COUNT(n, 2);  /* bytes */
   n = COUNT(n, 3);  /* 16-bit halves */
   n = COUNT(n, 4);  /* 32-bit halves */
   n = COUNT(n, 5);  /* final 64-bit sum */
   return (int)n;
}
2168 
2169 
/**
 * \brief Estimate the density (fraction of set bits) of A[r.., c..].
 *
 * Single-word matrices are counted exactly; otherwise only every
 * res-th complete word of each row is sampled (res defaults to roughly
 * 1% of the width), while the first and last partial words are counted
 * bit by bit.
 *
 * NOTE(review): the first-word pass runs j from c to m4ri_radix and
 * then adds m4ri_radix to the total unconditionally, which is only
 * consistent when c < m4ri_radix -- presumably callers guarantee this;
 * verify against call sites.  Likewise the width==1 branch divides by
 * the full matrix size even though counting starts at (r, c).
 */
double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c) {
  size_t count = 0;
  size_t total = 0;

  if(A->width == 1) {
    /* exact count for single-word matrices */
    for(rci_t i = r; i < A->nrows; ++i)
      for(rci_t j = c; j < A->ncols; ++j)
        if(mzd_read_bit(A, i, j))
          ++count;
    return ((double)count)/(1.0 * A->ncols * A->nrows);
  }

  /* default sampling stride: about one word in a hundred */
  if(res == 0)
    res = A->width / 100;
  if (res < 1)
    res = 1;

  for(rci_t i = r; i < A->nrows; ++i) {
    word *truerow = A->rows[i];
    /* first (partial) word, counted bit by bit */
    for(rci_t j = c; j < m4ri_radix; ++j)
      if(mzd_read_bit(A, i, j))
        ++count;
    total += m4ri_radix;

    /* sample every res-th complete word */
    for(wi_t j = MAX(1, c / m4ri_radix); j < A->width - 1; j += res) {
      count += m4ri_bitcount(truerow[j]);
      total += m4ri_radix;
    }
    /* last (partial) word, counted bit by bit */
    for(int j = 0; j < A->ncols % m4ri_radix; ++j)
      if(mzd_read_bit(A, i, m4ri_radix * (A->ncols / m4ri_radix) + j))
        ++count;
    total += A->ncols % m4ri_radix;
  }

  return (double)count / total;
}
2206 
/**
 * \brief Density of the whole matrix A, sampling every res-th word
 * (res == 0 selects the default stride); see _mzd_density.
 */
double mzd_density(mzd_t const *A, wi_t res) {
  return _mzd_density(A, res, 0, 0);
}
2210 
mzd_first_zero_row(mzd_t const * A)2211 rci_t mzd_first_zero_row(mzd_t const *A) {
2212   word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);
2213   wi_t const end = A->width - 1;
2214   word *row;
2215 
2216   for(rci_t i = A->nrows - 1; i >= 0; --i) {
2217     row = A->rows[i];
2218     word tmp = row[0];
2219     for (wi_t j = 1; j < end; ++j)
2220       tmp |= row[j];
2221     tmp |= row[end] & mask_end;
2222     if(tmp) {
2223       __M4RI_DD_INT(i + 1);
2224       return i + 1;
2225     }
2226   }
2227   __M4RI_DD_INT(0);
2228   return 0;
2229 }
2230 
/**
 * \brief Extract the upper triangular k x k part of A (k = min(nrows,
 * ncols)) by zeroing everything strictly below the diagonal.
 *
 * When U is NULL a fresh copy of the leading k x k block of A is made;
 * otherwise U must already be k x k (and presumably already holds the
 * entries to be kept -- A is not copied into it; verify at call sites).
 */
mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A) {
  rci_t const dim = MIN(A->nrows, A->ncols);
  if (U == NULL) {
    U = mzd_submatrix(NULL, A, 0, 0, dim, dim);
  } else {
    assert(U->nrows == dim && U->ncols == dim);
  }
  for (rci_t row = 1; row < U->nrows; row++) {
    /* full words entirely left of the diagonal word */
    for (wi_t w = 0; w < row / m4ri_radix; w++)
      U->rows[row][w] = 0;
    /* bits of the diagonal word strictly before the diagonal entry */
    if (row % m4ri_radix)
      mzd_clear_bits(U, row, (row / m4ri_radix) * m4ri_radix, row % m4ri_radix);
  }
  return U;
}
2246 
/**
 * \brief Extract the lower triangular k x k part of A (k = min(nrows,
 * ncols)) by zeroing everything strictly above the diagonal.
 *
 * When L is NULL a fresh copy of the leading k x k block of A is made;
 * otherwise L must already be k x k (and presumably already holds the
 * entries to be kept -- A is not copied into it; verify at call sites).
 */
mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A) {
  rci_t const dim = MIN(A->nrows, A->ncols);
  if (L == NULL) {
    L = mzd_submatrix(NULL, A, 0, 0, dim, dim);
  } else {
    assert(L->nrows == dim && L->ncols == dim);
  }
  for (rci_t row = 0; row < L->nrows - 1; row++) {
    /* bits of the diagonal word strictly after the diagonal entry */
    if (m4ri_radix - (row + 1) % m4ri_radix)
      mzd_clear_bits(L, row, row + 1, m4ri_radix - (row + 1) % m4ri_radix);
    /* full words entirely right of the diagonal word */
    for (wi_t w = row / m4ri_radix + 1; w < L->width; w++)
      L->rows[row][w] = 0;
  }
  return L;
}
2262