1 /**
2  * \file mzd.h
3  * \brief Dense matrices over GF(2) represented as a bit field.
4  *
5  * \author Gregory Bard <bard@fordham.edu>
6  * \author Martin Albrecht <martinralbrecht+m4ri@googlemail.com>
7  * \author Carlo Wood <carlo@alinoe.com>
8  */
9 
10 #ifndef M4RI_MZD
11 #define M4RI_MZD
12 
13 /*******************************************************************
14 *
15 *                M4RI: Linear Algebra over GF(2)
16 *
17 *    Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
18 *    Copyright (C) 2008-2013 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
19 *    Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
20 *
21 *  Distributed under the terms of the GNU General Public License (GPL)
22 *  version 2 or higher.
23 *
24 *    This code is distributed in the hope that it will be useful,
25 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
26 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27 *    General Public License for more details.
28 *
29 *  The full text of the GPL is available at:
30 *
31 *                  http://www.gnu.org/licenses/
32 *
33 ********************************************************************/
34 
35 #ifdef HAVE_CONFIG_H
36 #include "config.h"
37 #endif
38 
39 #include <m4ri/m4ri_config.h>
40 
41 #include <assert.h>
42 #include <math.h>
43 #include <stdio.h>
44 
45 #if __M4RI_HAVE_SSE2
46 #include <emmintrin.h>
47 #endif
48 
49 #include <m4ri/misc.h>
50 #include <m4ri/debug_dump.h>
51 
52 /**
53  * Maximum number of words allocated for one mzd_t block.
54  *
55  * \note This value must fit in an int, even though it's type is size_t.
56  */
57 
58 #define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27)
59 
60 /**
61  * \brief Matrix multiplication block-ing dimension.
62  *
63  * Defines the number of rows of the matrix A that are
64  * processed as one block during the execution of a multiplication
65  * algorithm.
66  */
67 
68 #define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L3_CACHE))) / 2, 2048)
69 
70 /**
71  * \brief Data containers containing the values packed into words
72  */
73 
74 typedef struct {
75   size_t size; /*!< number of words */
76   word *begin; /*!< first word */
77   word *end;   /*!< last word */
78 } mzd_block_t;
79 
80 /**
81  * \brief Dense matrices over GF(2).
82  *
83  * The most fundamental data type in this library.
84  */
85 
86 typedef struct mzd_t {
87 
88   rci_t nrows; /*!< Number of rows. */
89   rci_t ncols; /*!< Number of columns. */
90   wi_t width;  /*!< Number of words with valid bits: width = ceil(ncols / m4ri_radix) */
91 
92   /**
93    * Offset in words between rows.
94    *
95    * rowstride = (width < mzd_paddingwidth || (width & 1) == 0) ? width : width + 1;
96    * where width is the width of the underlying non-windowed matrix.
97    */
98 
99   wi_t rowstride;
100 
101   /**
102    * Offset in words from start of block to first word.
103    *
104    * rows[0] = blocks[0].begin + offset_vector;
105    * This, together with rowstride, makes the rows array obsolete.
106    */
107 
108   wi_t offset_vector;
109 
110   wi_t row_offset; /*!< Number of rows to the first row counting from the start of the first block. */
111 
112   /**
113    * Booleans to speed up things.
114    *
115    * The bits have the following meaning:
116    *
117    * 1: Has non-zero excess.
118    * 2: Is windowed, but has zero offset.
119    * 3: Is windowed, but has zero excess.
120    * 4: Is windowed, but owns the blocks allocations.
121    * 5: Spans more than 1 block.
122    */
123 
124   uint8_t flags;
125 
126   /**
127    * blockrows_log = log2(blockrows);
128    * where blockrows is the number of rows in one block, which is a power of 2.
129    */
130 
131   uint8_t blockrows_log;
132 
133   /* ensures sizeof(mzd_t) == 64 */
134   uint8_t padding[62 - 2 * sizeof(rci_t) - 4 * sizeof(wi_t) - sizeof(word) - 2 * sizeof(void *)];
135 
136   word high_bitmask;   /*!< Mask for valid bits in the word with the highest index (width - 1). */
137   mzd_block_t *blocks; /*!< Pointers to the actual blocks of memory containing the values packed into words. */
138   word **rows;         /*!< Address of first word in each row, so the first word of row i is is m->rows[i] */
139 } mzd_t;
140 
141 /**
142  * \brief The minimum width where padding occurs.
143  */
144 static wi_t const mzd_paddingwidth = 1;
145 
146 /**
147  * \brief flag when ncols%64 == 0
148  */
149 
150 static uint8_t const mzd_flag_nonzero_excess = 0x2;
151 
152 /**
153  * \brief flag for windowed matrix
154  */
155 
156 static uint8_t const mzd_flag_windowed_zerooffset = 0x4;
157 
158 /**
159  * \brief flag for windowed matrix where ncols%64 == 0
160  */
161 
162 static uint8_t const mzd_flag_windowed_zeroexcess = 0x8;
163 
164 /**
165  * \brief flag for windowed matrix wich owns its memory
166  */
167 
168 static uint8_t const mzd_flag_windowed_ownsblocks = 0x10;
169 
170 /**
171  * \brief flag for multiply blocks
172  */
173 
174 static uint8_t const mzd_flag_multiple_blocks = 0x20;
175 
176 /**
177  * \brief Test if a matrix is windowed.
178  *
179  * \param M Matrix
180  *
181  * \return a non-zero value if the matrix is windowed, otherwise return zero.
182  */
mzd_is_windowed(mzd_t const * M)183 static inline int mzd_is_windowed(mzd_t const *M) { return M->flags & (mzd_flag_windowed_zerooffset); }
184 
185 /**
186  * \brief Test if this mzd_t should free blocks.
187  *
188  * \param M Matrix
189  *
190  * \return TRUE iff blocks is non-zero and should be freed upon a call to mzd_free.
191  */
mzd_owns_blocks(mzd_t const * M)192 static inline int mzd_owns_blocks(mzd_t const *M) {
193   return M->blocks && (!mzd_is_windowed(M) || ((M->flags & mzd_flag_windowed_ownsblocks)));
194 }
195 
196 /**
197  * \brief Get a pointer the first word.
198  *
199  * \param M Matrix
200  *
201  * \return a pointer to the first word of the first row.
202  */
203 
mzd_first_row(mzd_t const * M)204 static inline word *mzd_first_row(mzd_t const *M) {
205   word *result = M->blocks[0].begin + M->offset_vector;
206   assert(M->nrows == 0 || result == M->rows[0]);
207   return result;
208 }
209 
210 /**
211  * \brief Get a pointer to the first word in block n.
212  *
213  * Use mzd_first_row for block number 0.
214  *
215  * \param M Matrix
216  * \param n The block number. Must be larger than 0.
217  *
218  * \return a pointer to the first word of the first row in block n.
219  */
mzd_first_row_next_block(mzd_t const * M,int n)220 static inline word *mzd_first_row_next_block(mzd_t const *M, int n) {
221   assert(n > 0);
222   return M->blocks[n].begin + M->offset_vector - M->row_offset * M->rowstride;
223 }
224 
225 /**
226  * \brief Convert row to blocks index.
227  *
228  * \param M Matrix.
229  * \param row The row to convert.
230  *
231  * \return the block number that contains this row.
232  */
233 
mzd_row_to_block(mzd_t const * M,rci_t row)234 static inline int mzd_row_to_block(mzd_t const *M, rci_t row) { return (M->row_offset + row) >> M->blockrows_log; }
235 
236 /**
237  * \brief Total number of rows in this block.
238  *
239  * Should be called with a constant n=0, or with
240  * n > 0 when n is a variable, for optimization
241  * reasons.
242  *
243  * \param M Matrix
244  * \param n The block number.
245  *
246  * \return the total number of rows in this block.
247  */
248 
mzd_rows_in_block(mzd_t const * M,int n)249 static inline wi_t mzd_rows_in_block(mzd_t const *M, int n) {
250   if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
251     if (__M4RI_UNLIKELY(n == 0)) {
252       return (1 << M->blockrows_log) - M->row_offset;
253     } else {
254       int const last_block = mzd_row_to_block(M, M->nrows - 1);
255       if (n < last_block) {
256         return (1 << M->blockrows_log);
257       }
258       return M->nrows + M->row_offset - (n << M->blockrows_log);
259     }
260   }
261   return n ? 0 : M->nrows;
262 }
263 
264 /**
265  * \brief Number of rows in this block including r
266  *
267  * \param M Matrix
268  * \param r row
269  *
270  * \return the number of rows with index >= r in this block
271  */
272 
mzd_remaining_rows_in_block(mzd_t const * M,rci_t r)273 static inline wi_t mzd_remaining_rows_in_block(mzd_t const *M, rci_t r) {
274   const int n = mzd_row_to_block(M, r);
275   r = (r - (n << M->blockrows_log));
276   if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
277     if (__M4RI_UNLIKELY(n == 0)) {
278       return (1 << M->blockrows_log) - M->row_offset - r;
279     } else {
280       int const last_block = mzd_row_to_block(M, M->nrows - 1);
281       if (n < last_block) {
282         return (1 << M->blockrows_log) - r;
283       }
284       return M->nrows + M->row_offset - (n << M->blockrows_log) - r;
285     }
286   }
287   return n ? 0 : M->nrows - r;
288 }
289 
290 /**
291  * \brief Get pointer to first word of row.
292  *
293  * \param M Matrix
294  * \param row The row index.
295  *
296  * \return pointer to first word of the row.
297  */
298 
mzd_row(mzd_t const * M,rci_t row)299 static inline word *mzd_row(mzd_t const *M, rci_t row) {
300   wi_t big_vector = M->offset_vector + row * M->rowstride;
301   word *result    = M->blocks[0].begin + big_vector;
302   if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
303     int const n = (M->row_offset + row) >> M->blockrows_log;
304     result = M->blocks[n].begin + big_vector - n * (M->blocks[0].size / sizeof(word));
305   }
306   assert(result == M->rows[row]);
307   return result;
308 }
309 
310 /**
311  * \brief Create a new matrix of dimension r x c.
312  *
313  * Use mzd_free to kill it.
314  *
315  * \param r Number of rows
316  * \param c Number of columns
317  *
318  */
319 
320 mzd_t *mzd_init(rci_t const r, rci_t const c);
321 
322 /**
323  * \brief Free a matrix created with mzd_init.
324  *
325  * \param A Matrix
326  */
327 
328 void mzd_free(mzd_t *A);
329 
330 /**
331  * \brief Create a window/view into the matrix M.
332  *
333  * A matrix window for M is a meta structure on the matrix M. It is
334  * setup to point into the matrix so M \em must \em not be freed while the
335  * matrix window is used.
336  *
337  * This function puts the restriction on the provided parameters that
338  * all parameters must be within range for M which is not enforced
339  * currently .
340  *
341  * Use mzd_free_window to free the window.
342  *
343  * \param M Matrix
344  * \param lowr Starting row (inclusive)
345  * \param lowc Starting column (inclusive, must be multiple of m4ri_radix)
346  * \param highr End row (exclusive)
347  * \param highc End column (exclusive)
348  *
349  */
350 
351 mzd_t *mzd_init_window(mzd_t *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);
352 
353 /**
354  * \brief Create a const window/view into a const matrix M.
355  *
356  * See mzd_init_window, but for constant M.
357  */
358 
mzd_init_window_const(mzd_t const * M,rci_t const lowr,rci_t const lowc,rci_t const highr,rci_t const highc)359 static inline mzd_t const *mzd_init_window_const(mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr,
360                                                  rci_t const highc) {
361   return mzd_init_window((mzd_t *)M, lowr, lowc, highr, highc);
362 }
363 
364 /**
365  * \brief Free a matrix window created with mzd_init_window.
366  *
367  * \param A Matrix
368  */
369 
370 #define mzd_free_window mzd_free
371 
372 /**
373  * \brief Swap the two rows rowa and rowb starting at startblock.
374  *
375  * \param M Matrix with a zero offset.
376  * \param rowa Row index.
377  * \param rowb Row index.
378  * \param startblock Start swapping only in this block.
379  */
380 
_mzd_row_swap(mzd_t * M,rci_t const rowa,rci_t const rowb,wi_t const startblock)381 static inline void _mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb, wi_t const startblock) {
382   if ((rowa == rowb) || (startblock >= M->width)) {
383     return;
384   }
385 
386   wi_t width = M->width - startblock - 1;
387   word *a    = M->rows[rowa] + startblock;
388   word *b    = M->rows[rowb] + startblock;
389   word tmp;
390   word const mask_end = M->high_bitmask;
391 
392   for (wi_t i = 0; i < width; ++i) {
393     tmp  = a[i];
394     a[i] = b[i];
395     b[i] = tmp;
396   }
397   tmp = (a[width] ^ b[width]) & mask_end;
398   a[width] ^= tmp;
399   b[width] ^= tmp;
400 
401   __M4RI_DD_ROW(M, rowa);
402   __M4RI_DD_ROW(M, rowb);
403 }
404 
405 /**
406  * \brief Swap the two rows rowa and rowb.
407  *
408  * \param M Matrix
409  * \param rowa Row index.
410  * \param rowb Row index.
411  */
412 
mzd_row_swap(mzd_t * M,rci_t const rowa,rci_t const rowb)413 static inline void mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb) { _mzd_row_swap(M, rowa, rowb, 0); }
414 
415 /**
416  * \brief copy row j from A to row i from B.
417  *
418  * The offsets of A and B must match and the number of columns of A
419  * must be less than or equal to the number of columns of B.
420  *
421  * \param B Target matrix.
422  * \param i Target row index.
423  * \param A Source matrix.
424  * \param j Source row index.
425  */
426 
427 void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j);
428 
429 /**
430  * \brief Swap the two columns cola and colb.
431  *
432  * \param M Matrix.
433  * \param cola Column index.
434  * \param colb Column index.
435  */
436 
437 void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb);
438 
439 /**
440  * \brief Swap the two columns cola and colb but only between start_row and stop_row.
441  *
442  * \param M Matrix.
443  * \param cola Column index.
444  * \param colb Column index.
445  * \param start_row Row index.
446  * \param stop_row Row index (exclusive).
447  */
448 
mzd_col_swap_in_rows(mzd_t * M,rci_t const cola,rci_t const colb,rci_t const start_row,rci_t const stop_row)449 static inline void mzd_col_swap_in_rows(mzd_t *M, rci_t const cola, rci_t const colb, rci_t const start_row,
450                                         rci_t const stop_row) {
451   if (cola == colb) {
452     return;
453   }
454 
455   rci_t const _cola = cola;
456   rci_t const _colb = colb;
457 
458   wi_t const a_word = _cola / m4ri_radix;
459   wi_t const b_word = _colb / m4ri_radix;
460 
461   int const a_bit = _cola % m4ri_radix;
462   int const b_bit = _colb % m4ri_radix;
463 
464   word *RESTRICT ptr  = mzd_row(M, start_row);
465   int max_bit         = MAX(a_bit, b_bit);
466   int count_remaining = stop_row - start_row;
467   int min_bit         = a_bit + b_bit - max_bit;
468   int block           = mzd_row_to_block(M, start_row);
469   int offset          = max_bit - min_bit;
470   word mask           = m4ri_one << min_bit;
471   int count = MIN(mzd_remaining_rows_in_block(M, start_row), count_remaining);
472 
473   // Apparently we're calling with start_row == stop_row sometimes (seems a bug to me).
474   if (count <= 0) {
475     return;
476   }
477 
478   if (a_word == b_word) {
479     while (1) {
480       count_remaining -= count;
481       ptr += a_word;
482       int fast_count = count / 4;
483       int rest_count = count - 4 * fast_count;
484       word xor_v[4];
485       wi_t const rowstride = M->rowstride;
486       while (fast_count--) {
487         xor_v[0] = ptr[0];
488         xor_v[1] = ptr[rowstride];
489         xor_v[2] = ptr[2 * rowstride];
490         xor_v[3] = ptr[3 * rowstride];
491         xor_v[0] ^= xor_v[0] >> offset;
492         xor_v[1] ^= xor_v[1] >> offset;
493         xor_v[2] ^= xor_v[2] >> offset;
494         xor_v[3] ^= xor_v[3] >> offset;
495         xor_v[0] &= mask;
496         xor_v[1] &= mask;
497         xor_v[2] &= mask;
498         xor_v[3] &= mask;
499         xor_v[0] |= xor_v[0] << offset;
500         xor_v[1] |= xor_v[1] << offset;
501         xor_v[2] |= xor_v[2] << offset;
502         xor_v[3] |= xor_v[3] << offset;
503         ptr[0] ^= xor_v[0];
504         ptr[rowstride] ^= xor_v[1];
505         ptr[2 * rowstride] ^= xor_v[2];
506         ptr[3 * rowstride] ^= xor_v[3];
507         ptr += 4 * rowstride;
508       }
509       while (rest_count--) {
510         word xor_v = *ptr;
511         xor_v ^= xor_v >> offset;
512         xor_v &= mask;
513         *ptr ^= xor_v | (xor_v << offset);
514         ptr += rowstride;
515       }
516       block++;
517       if ((count = MIN(mzd_rows_in_block(M, block), count_remaining)) <= 0) {
518         break;
519       }
520       ptr = mzd_first_row_next_block(M, block);
521     }
522   } else {
523     word *RESTRICT min_ptr;
524     wi_t max_offset;
525     if (min_bit == a_bit) {
526       min_ptr    = ptr + a_word;
527       max_offset = b_word - a_word;
528     } else {
529       min_ptr    = ptr + b_word;
530       max_offset = a_word - b_word;
531     }
532     while (1) {
533       count_remaining -= count;
534       wi_t const rowstride = M->rowstride;
535       while (count--) {
536         word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
537         min_ptr[0] ^= xor_v;
538         min_ptr[max_offset] ^= xor_v << offset;
539         min_ptr += rowstride;
540       }
541       block++;
542       if ((count = MIN(mzd_rows_in_block(M, +block), count_remaining)) <= 0) {
543         break;
544       }
545       ptr = mzd_first_row_next_block(M, block);
546       if (min_bit == a_bit) {
547         min_ptr = ptr + a_word;
548       } else {
549         min_ptr = ptr + b_word;
550       }
551     }
552   }
553 
554   __M4RI_DD_MZD(M);
555 }
556 
557 /**
558  * \brief Read the bit at position M[row,col].
559  *
560  * \param M Matrix
561  * \param row Row index
562  * \param col Column index
563  *
564  * \note No bounds checks whatsoever are performed.
565  *
566  */
567 
mzd_read_bit(mzd_t const * M,rci_t const row,rci_t const col)568 static inline BIT mzd_read_bit(mzd_t const *M, rci_t const row, rci_t const col) {
569   return __M4RI_GET_BIT(M->rows[row][col / m4ri_radix], col % m4ri_radix);
570 }
571 
572 /**
573  * \brief Write the bit value to position M[row,col]
574  *
575  * \param M Matrix
576  * \param row Row index
577  * \param col Column index
578  * \param value Either 0 or 1
579  *
580  * \note No bounds checks whatsoever are performed.
581  *
582  */
583 
mzd_write_bit(mzd_t * M,rci_t const row,rci_t const col,BIT const value)584 static inline void mzd_write_bit(mzd_t *M, rci_t const row, rci_t const col, BIT const value) {
585   __M4RI_WRITE_BIT(M->rows[row][col / m4ri_radix], col % m4ri_radix, value);
586 }
587 
588 /**
589  * \brief XOR n bits from values to M starting a position (x,y).
590  *
591  * \param M Source matrix.
592  * \param x Starting row.
593  * \param y Starting column.
594  * \param n Number of bits (<= m4ri_radix);
595  * \param values Word with values;
596  */
597 
mzd_xor_bits(mzd_t const * M,rci_t const x,rci_t const y,int const n,word values)598 static inline void mzd_xor_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
599   int const spot   = y % m4ri_radix;
600   wi_t const block = y / m4ri_radix;
601   M->rows[x][block] ^= values << spot;
602   int const space = m4ri_radix - spot;
603   if (n > space) {
604     M->rows[x][block + 1] ^= values >> space;
605   }
606 }
607 
608 /**
609  * \brief AND n bits from values to M starting a position (x,y).
610  *
611  * \param M Source matrix.
612  * \param x Starting row.
613  * \param y Starting column.
614  * \param n Number of bits (<= m4ri_radix);
615  * \param values Word with values;
616  */
617 
mzd_and_bits(mzd_t const * M,rci_t const x,rci_t const y,int const n,word values)618 static inline void mzd_and_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
619   /* This is the best way, since this will drop out once we inverse the bits in values: */
620   values >>= (m4ri_radix - n); /* Move the bits to the lowest columns */
621 
622   int const spot   = y % m4ri_radix;
623   wi_t const block = y / m4ri_radix;
624   M->rows[x][block] &= values << spot;
625   int const space = m4ri_radix - spot;
626   if (n > space) {
627     M->rows[x][block + 1] &= values >> space;
628   }
629 }
630 
631 /**
632  * \brief Clear n bits in M starting a position (x,y).
633  *
634  * \param M Source matrix.
635  * \param x Starting row.
636  * \param y Starting column.
637  * \param n Number of bits (0 < n <= m4ri_radix);
638  */
639 
mzd_clear_bits(mzd_t const * M,rci_t const x,rci_t const y,int const n)640 static inline void mzd_clear_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
641   assert(n > 0 && n <= m4ri_radix);
642   word values      = m4ri_ffff >> (m4ri_radix - n);
643   int const spot   = y % m4ri_radix;
644   wi_t const block = y / m4ri_radix;
645   M->rows[x][block] &= ~(values << spot);
646   int const space = m4ri_radix - spot;
647   if (n > space) {
648     M->rows[x][block + 1] &= ~(values >> space);
649   }
650 }
651 
652 /**
653  * \brief Add the rows sourcerow and destrow and stores the total in the row
654  * destrow, but only begins at the column coloffset.
655  *
656  * \param M Matrix
657  * \param dstrow Index of target row
658  * \param srcrow Index of source row
659  * \param coloffset Start column (0 <= coloffset < M->ncols)
660  *
661  * \warning This function expects that there is at least one word worth of work.
662  */
663 
mzd_row_add_offset(mzd_t * M,rci_t dstrow,rci_t srcrow,rci_t coloffset)664 static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_t coloffset) {
665   assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols);
666   wi_t const startblock = coloffset / m4ri_radix;
667   wi_t wide             = M->width - startblock;
668   word *src             = M->rows[srcrow] + startblock;
669   word *dst             = M->rows[dstrow] + startblock;
670   word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset % m4ri_radix);
671   word const mask_end   = M->high_bitmask;
672 
673   *dst++ ^= *src++ & mask_begin;
674   --wide;
675 
676 #if __M4RI_HAVE_SSE2
677   int not_aligned = __M4RI_ALIGNMENT(src, 16) != 0; /* 0: Aligned, 1: Not aligned */
678   if (wide > not_aligned + 1)                       /* Speed up for small matrices */
679   {
680     if (not_aligned) {
681       *dst++ ^= *src++;
682       --wide;
683     }
684     /* Now wide > 1 */
685     __m128i *__src     = (__m128i *)src;
686     __m128i *__dst     = (__m128i *)dst;
687     __m128i *const eof = (__m128i *)((unsigned long)(src + wide) & ~0xFUL);
688     do {
689       __m128i xmm1 = _mm_xor_si128(*__dst, *__src);
690       *__dst++     = xmm1;
691     } while (++__src < eof);
692     src  = (word *)__src;
693     dst  = (word *)__dst;
694     wide = ((sizeof(word) * wide) % 16) / sizeof(word);
695   }
696 #endif
697   wi_t i = -1;
698   while (++i < wide) {
699     dst[i] ^= src[i];
700   }
701   /*
702    * Revert possibly non-zero excess bits.
703    * Note that i == wide here, and wide can be 0.
704    * But really, src[wide - 1] is M->rows[srcrow][M->width - 1] ;)
705    * We use i - 1 here to let the compiler know these are the same addresses
706    * that we last accessed, in the previous loop.
707    */
708   dst[i - 1] ^= src[i - 1] & ~mask_end;
709 
710   __M4RI_DD_ROW(M, dstrow);
711 }
712 
713 /**
714  * \brief Add the rows sourcerow and destrow and stores the total in
715  * the row destrow.
716  *
717  * \param M Matrix
718  * \param sourcerow Index of source row
719  * \param destrow Index of target row
720  *
721  * \note this can be done much faster with mzd_combine.
722  */
723 
724 void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow);
725 
726 /**
727  * \brief Transpose a matrix.
728  *
729  * This function uses the fact that:
730 \verbatim
731    [ A B ]T    [AT CT]
732    [ C D ]  =  [BT DT]
733  \endverbatim
734  * and thus rearranges the blocks recursively.
735  *
736  * \param DST Preallocated return matrix, may be NULL for automatic creation.
737  * \param A Matrix
738  */
739 
740 mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A);
741 
742 /**
743  * \brief Naive cubic matrix multiplication.
744  *
745  * That is, compute C such that C == AB.
746  *
747  * \param C Preallocated product matrix, may be NULL for automatic creation.
748  * \param A Input matrix A.
749  * \param B Input matrix B.
750  *
751  * \note Normally, if you will multiply several times by b, it is
752  * smarter to calculate bT yourself, and keep it, and then use the
753  * function called _mzd_mul_naive
754  *
755  */
756 mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);
757 
758 /**
759  * \brief Naive cubic matrix multiplication and addition
760  *
761  * That is, compute C such that C == C + AB.
762  *
763  * \param C Preallocated product matrix.
764  * \param A Input matrix A.
765  * \param B Input matrix B.
766  *
767  * \note Normally, if you will multiply several times by b, it is
768  * smarter to calculate bT yourself, and keep it, and then use the
769  * function called _mzd_mul_naive
770  */
771 
772 mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);
773 
774 /**
775  * \brief Naive cubic matrix multiplication with the pre-transposed B.
776  *
777  * That is, compute C such that C == AB^t.
778  *
779  * \param C Preallocated product matrix.
780  * \param A Input matrix A.
781  * \param B Pre-transposed input matrix B.
782  * \param clear Whether to clear C before accumulating AB
783  */
784 
785 mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, int const clear);
786 
787 /**
788  * \brief Matrix multiplication optimized for v*A where v is a vector.
789  *
790  * \param C Preallocated product matrix.
791  * \param v Input matrix v.
792  * \param A Input matrix A.
793  * \param clear If set clear C first, otherwise add result to C.
794  *
795  */
796 mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear);
797 
798 /**
799  * \brief Fill matrix M with uniformly distributed bits.
800  *
801  * \param M Matrix
802  */
803 
804 void mzd_randomize(mzd_t *M);
805 
806 /**
807  * \brief Random callback that produces uniformly distributed random
808  * words on every call.
809  *
810  * \param data callback data
811  *
812  * \return uniformly distributed random word
813  */
814 typedef word (*m4ri_random_callback)(void* data);
815 
816 /**
817  * \brief Fill matrix M with uniformly distributed bits.
818  *
819  * \param M Matrix
820  * \param rc callback
821  * \param data callback data passed to every call to rc
822  */
823 void mzd_randomize_custom(mzd_t *M, m4ri_random_callback rc, void* data);
824 
825 /**
826  * \brief Set the matrix M to the value equivalent to the integer
827  * value provided.
828  *
829  * Specifically, this function does nothing if value%2 == 0 and
830  * returns the identity matrix if value%2 == 1.
831  *
832  * If the matrix is not square then the largest possible square
833  * submatrix is set to the identity matrix.
834  *
835  * \param M Matrix
836  * \param value Either 0 or 1
837  */
838 
839 void mzd_set_ui(mzd_t *M, unsigned int const value);
840 
841 /**
842  * \brief Gaussian elimination.
843  *
844  * This will do Gaussian elimination on the matrix m but will start
845  * not at column 0 necc but at column startcol. If full=FALSE, then it
846  * will do triangular style elimination, and if full=TRUE, it will do
847  * Gauss-Jordan style, or full elimination.
848  *
849  * \param M Matrix
850  * \param startcol First column to consider for reduction.
851  * \param full Gauss-Jordan style or upper triangular form only.
852  */
853 
854 rci_t mzd_gauss_delayed(mzd_t *M, rci_t const startcol, int const full);
855 
856 /**
857  * \brief Gaussian elimination.
858  *
859  * This will do Gaussian elimination on the matrix m.  If full=FALSE,
860  *  then it will do triangular style elimination, and if full=TRUE,
861  *  it will do Gauss-Jordan style, or full elimination.
862  *
863  * \param M Matrix
864  * \param full Gauss-Jordan style or upper triangular form only.
865  *
866  * \sa mzd_echelonize_m4ri(), mzd_echelonize_pluq()
867  */
868 
869 rci_t mzd_echelonize_naive(mzd_t *M, int const full);
870 
871 /**
872  * \brief Return TRUE if A == B.
873  *
874  * \param A Matrix
875  * \param B Matrix
876  */
877 
878 int mzd_equal(mzd_t const *A, mzd_t const *B);
879 
880 /**
881  * \brief Return -1,0,1 if if A < B, A == B or A > B respectively.
882  *
883  * \param A Matrix.
884  * \param B Matrix.
885  *
886  * \note This comparison is not well defined mathematically and
887  * relatively arbitrary since elements of GF(2) don't have an
888  * ordering.
889  */
890 
891 int mzd_cmp(mzd_t const *A, mzd_t const *B);
892 
893 /**
894  * \brief Copy matrix  A to DST.
895  *
896  * \param DST May be NULL for automatic creation.
897  * \param A Source matrix.
898  */
899 
900 mzd_t *mzd_copy(mzd_t *DST, mzd_t const *A);
901 
902 /**
903  * \brief Concatenate B to A and write the result to C.
904  *
905  * That is,
906  *
907  \verbatim
908  [ A ], [ B ] -> [ A  B ] = C
909  \endverbatim
910  *
911  * The inputs are not modified but a new matrix is created.
912  *
913  * \param C Matrix, may be NULL for automatic creation
914  * \param A Matrix
915  * \param B Matrix
916  *
917  * \note This is sometimes called augment.
918  */
919 
920 mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B);
921 
922 /**
923  * \brief Stack A on top of B and write the result to C.
924  *
925  * That is,
926  *
927  \verbatim
928  [ A ], [ B ] -> [ A ] = C
929                  [ B ]
930  \endverbatim
931  *
932  * The inputs are not modified but a new matrix is created.
933  *
934  * \param C Matrix, may be NULL for automatic creation
935  * \param A Matrix
936  * \param B Matrix
937  */
938 
939 mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B);
940 
941 /**
942  * \brief Copy a submatrix.
943  *
944  * Note that the upper bounds are not included.
945  *
946  * \param S Preallocated space for submatrix, may be NULL for automatic creation.
947  * \param M Matrix
948  * \param lowr start rows
949  * \param lowc start column
950  * \param highr stop row (this row is \em not included)
951  * \param highc stop column (this column is \em not included)
952  */
953 mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr,
954                      rci_t const highc);
955 
956 /**
957  * \brief Invert the matrix target using Gaussian elimination.
958  *
959  * To avoid recomputing the identity matrix over and over again, I may
960  * be passed in as identity parameter.
961  *
962  * \param INV Preallocated space for inversion matrix, may be NULL for automatic creation.
963  * \param A Matrix to be reduced.
964  * \param I Identity matrix.
965  */
966 
967 mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I);
968 
969 /**
970  * \brief Set C = A+B.
971  *
972  * C is also returned. If C is NULL then a new matrix is created which
973  * must be freed by mzd_free.
974  *
975  * \param C Preallocated sum matrix, may be NULL for automatic creation.
976  * \param A Matrix
977  * \param B Matrix
978  */
979 
980 mzd_t *mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);
981 
982 /**
983  * \brief Same as mzd_add but without any checks on the input.
984  *
985  * \param C Preallocated sum matrix, may be NULL for automatic creation.
986  * \param A Matrix
987  * \param B Matrix
988  */
989 
990 mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);
991 
992 /**
993  * \brief Same as mzd_add.
994  *
995  * \param C Preallocated difference matrix, may be NULL for automatic creation.
996  * \param A Matrix
997  * \param B Matrix
998  */
999 
1000 #define mzd_sub mzd_add
1001 
1002 /**
1003  * \brief Same as mzd_sub but without any checks on the input.
1004  *
1005  * \param C Preallocated difference matrix, may be NULL for automatic creation.
1006  * \param A Matrix
1007  * \param B Matrix
1008  */
1009 
1010 #define _mzd_sub _mzd_add
1011 
1012 /**
1013  * Get n bits starting a position (x,y) from the matrix M.
1014  *
1015  * \param M Source matrix.
1016  * \param x Starting row.
1017  * \param y Starting column.
1018  * \param n Number of bits (<= m4ri_radix);
1019  */
1020 
mzd_read_bits(mzd_t const * M,rci_t const x,rci_t const y,int const n)1021 static inline word mzd_read_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
1022   int const spot   = y % m4ri_radix;
1023   wi_t const block = y / m4ri_radix;
1024   int const spill  = spot + n - m4ri_radix;
1025   word temp        = (spill <= 0) ? M->rows[x][block] << -spill
1026                            : (M->rows[x][block + 1] << (m4ri_radix - spill)) | (M->rows[x][block] >> spill);
1027   return temp >> (m4ri_radix - n);
1028 }
1029 
1030 /**
1031  * \brief a_row[a_startblock:] += b_row[b_startblock:] for offset 0
1032  *
1033  * Adds a_row of A, starting with a_startblock to the end, to
1034  * b_row of B, starting with b_startblock to the end. This gets stored
1035  * in A, in a_row, starting with a_startblock.
1036  *
1037  * \param A destination matrix
1038  * \param a_row destination row for matrix C
1039  * \param a_startblock starting block to work on in matrix C
1040  * \param B source matrix
1041  * \param b_row source row for matrix B
1042  * \param b_startblock starting block to work on in matrix B
1043  *
1044  */
1045 
mzd_combine_even_in_place(mzd_t * A,rci_t const a_row,wi_t const a_startblock,mzd_t const * B,rci_t const b_row,wi_t const b_startblock)1046 static inline void mzd_combine_even_in_place(mzd_t *A,       rci_t const a_row, wi_t const a_startblock,
1047                                              mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
1048 
1049   wi_t wide = A->width - a_startblock - 1;
1050 
1051   word *a = A->rows[a_row] + a_startblock;
1052   word *b = B->rows[b_row] + b_startblock;
1053 
1054 #if __M4RI_HAVE_SSE2
1055   if (wide > 2) {
1056     /** check alignments **/
1057     if (__M4RI_ALIGNMENT(a, 16)) {
1058       *a++ ^= *b++;
1059       wide--;
1060     }
1061 
1062     if (__M4RI_ALIGNMENT(a, 16) == 0 && __M4RI_ALIGNMENT(b, 16) == 0) {
1063       __m128i *a128      = (__m128i *)a;
1064       __m128i *b128      = (__m128i *)b;
1065       const __m128i *eof = (__m128i *)((unsigned long)(a + wide) & ~0xFUL);
1066 
1067       do {
1068         *a128 = _mm_xor_si128(*a128, *b128);
1069         ++b128;
1070         ++a128;
1071       } while (a128 < eof);
1072 
1073       a    = (word *)a128;
1074       b    = (word *)b128;
1075       wide = ((sizeof(word) * wide) % 16) / sizeof(word);
1076     }
1077   }
1078 #endif  // __M4RI_HAVE_SSE2
1079 
1080   if (wide > 0) {
1081     wi_t n = (wide + 7) / 8;
1082     switch (wide % 8) {
1083     case 0: do { *(a++) ^= *(b++);
1084     case 7:      *(a++) ^= *(b++);
1085     case 6:      *(a++) ^= *(b++);
1086     case 5:      *(a++) ^= *(b++);
1087     case 4:      *(a++) ^= *(b++);
1088     case 3:      *(a++) ^= *(b++);
1089     case 2:      *(a++) ^= *(b++);
1090     case 1:      *(a++) ^= *(b++);
1091     } while (--n > 0);
1092     }
1093   }
1094 
1095   *a ^= *b & A->high_bitmask;
1096 
1097   __M4RI_DD_MZD(A);
1098 }
1099 
1100 /**
1101  * \brief c_row[c_startblock:] = a_row[a_startblock:] + b_row[b_startblock:] for offset 0
1102  *
1103  * Adds a_row of A, starting with a_startblock to the end, to
1104  * b_row of B, starting with b_startblock to the end. This gets stored
1105  * in C, in c_row, starting with c_startblock.
1106  *
1107  * \param C destination matrix
1108  * \param c_row destination row for matrix C
1109  * \param c_startblock starting block to work on in matrix C
1110  * \param A source matrix
1111  * \param a_row source row for matrix A
1112  * \param a_startblock starting block to work on in matrix A
1113  * \param B source matrix
1114  * \param b_row source row for matrix B
1115  * \param b_startblock starting block to work on in matrix B
1116  *
1117  */
1118 
mzd_combine_even(mzd_t * C,rci_t const c_row,wi_t const c_startblock,mzd_t const * A,rci_t const a_row,wi_t const a_startblock,mzd_t const * B,rci_t const b_row,wi_t const b_startblock)1119 static inline void mzd_combine_even(mzd_t *C,       rci_t const c_row, wi_t const c_startblock,
1120                                     mzd_t const *A, rci_t const a_row, wi_t const a_startblock,
1121                                     mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
1122 
1123   wi_t wide = A->width - a_startblock - 1;
1124   word *a   = A->rows[a_row] + a_startblock;
1125   word *b   = B->rows[b_row] + b_startblock;
1126   word *c   = C->rows[c_row] + c_startblock;
1127 
1128 #if __M4RI_HAVE_SSE2
1129   if (wide > 2) {
1130     /** check alignments **/
1131     if (__M4RI_ALIGNMENT(a, 16)) {
1132       *c++ = *b++ ^ *a++;
1133       wide--;
1134     }
1135 
1136     if ((__M4RI_ALIGNMENT(b, 16) | __M4RI_ALIGNMENT(c, 16)) == 0) {
1137       __m128i *a128      = (__m128i *)a;
1138       __m128i *b128      = (__m128i *)b;
1139       __m128i *c128      = (__m128i *)c;
1140       const __m128i *eof = (__m128i *)((unsigned long)(a + wide) & ~0xFUL);
1141 
1142       do {
1143         *c128 = _mm_xor_si128(*a128, *b128);
1144         ++c128;
1145         ++b128;
1146         ++a128;
1147       } while (a128 < eof);
1148 
1149       a    = (word *)a128;
1150       b    = (word *)b128;
1151       c    = (word *)c128;
1152       wide = ((sizeof(word) * wide) % 16) / sizeof(word);
1153     }
1154   }
1155 #endif  // __M4RI_HAVE_SSE2
1156 
1157   if (wide > 0) {
1158     wi_t n = (wide + 7) / 8;
1159     switch (wide % 8) {
1160     case 0: do { *(c++) = *(a++) ^ *(b++);
1161     case 7:      *(c++) = *(a++) ^ *(b++);
1162     case 6:      *(c++) = *(a++) ^ *(b++);
1163     case 5:      *(c++) = *(a++) ^ *(b++);
1164     case 4:      *(c++) = *(a++) ^ *(b++);
1165     case 3:      *(c++) = *(a++) ^ *(b++);
1166     case 2:      *(c++) = *(a++) ^ *(b++);
1167     case 1:      *(c++) = *(a++) ^ *(b++);
1168     } while (--n > 0);
1169     }
1170   }
1171   *c ^= ((*a ^ *b ^ *c) & C->high_bitmask);
1172 
1173   __M4RI_DD_MZD(C);
1174 }
1175 
1176 /**
1177  * \brief row3[col3:] = row1[col1:] + row2[col2:]
1178  *
1179  * Adds row1 of SC1, starting with startblock1 to the end, to
1180  * row2 of SC2, starting with startblock2 to the end. This gets stored
1181  * in DST, in row3, starting with startblock3.
1182  *
1183  * \param C destination matrix
1184  * \param c_row destination row for matrix dst
1185  * \param c_startblock starting block to work on in matrix dst
1186  * \param A source matrix
1187  * \param a_row source row for matrix sc1
1188  * \param a_startblock starting block to work on in matrix sc1
1189  * \param B source matrix
1190  * \param b_row source row for matrix sc2
1191  * \param b_startblock starting block to work on in matrix sc2
1192  *
1193  */
mzd_combine(mzd_t * C,rci_t const c_row,wi_t const c_startblock,mzd_t const * A,rci_t const a_row,wi_t const a_startblock,mzd_t const * B,rci_t const b_row,wi_t const b_startblock)1194 static inline void mzd_combine(mzd_t *C,       rci_t const c_row, wi_t const c_startblock,
1195                                mzd_t const *A, rci_t const a_row, wi_t const a_startblock,
1196                                mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
1197 
1198   if ((C == A) & (a_row == c_row) & (a_startblock == c_startblock)) {
1199     mzd_combine_even_in_place(C, c_row, c_startblock, B, b_row, b_startblock);
1200   } else {
1201     mzd_combine_even(C, c_row, c_startblock, A, a_row, a_startblock, B, b_row, b_startblock);
1202   }
1203   return;
1204 }
1205 
1206 /**
1207  * \brief Get n bits starting a position (x,y) from the matrix M.
1208  *
1209  * This function is in principle the same as mzd_read_bits,
1210  * but it explicitely returns an 'int' and is used as
1211  * index into an array (Gray code).
1212  */
1213 
mzd_read_bits_int(mzd_t const * M,rci_t const x,rci_t const y,int const n)1214 static inline int mzd_read_bits_int(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
1215   return __M4RI_CONVERT_TO_INT(mzd_read_bits(M, x, y, n));
1216 }
1217 
1218 /**
1219  * \brief Zero test for matrix.
1220  *
1221  * \param A Input matrix.
1222  *
1223  */
1224 int mzd_is_zero(mzd_t const *A);
1225 
1226 /**
1227  * \brief Clear the given row, but only begins at the column coloffset.
1228  *
1229  * \param M Matrix
1230  * \param row Index of row
1231  * \param coloffset Column offset
1232  */
1233 
1234 void mzd_row_clear_offset(mzd_t *M, rci_t const row, rci_t const coloffset);
1235 
1236 /**
1237  * \brief Find the next nonzero entry in M starting at start_row and start_col.
1238  *
1239  * This function walks down rows in the inner loop and columns in the
1240  * outer loop. If a nonzero entry is found this function returns 1 and
1241  * zero otherwise.
1242  *
1243  * If and only if a nonzero entry is found r and c are updated.
1244  *
1245  * \param M Matrix
1246  * \param start_row Index of row where to start search
1247  * \param start_col Index of column where to start search
1248  * \param r Row index updated if pivot is found
1249  * \param c Column index updated if pivot is found
1250  */
1251 
1252 int mzd_find_pivot(mzd_t const *M, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c);
1253 
1254 /**
1255  * \brief Return the number of nonzero entries divided by nrows *
1256  * ncols
1257  *
1258  * If res = 0 then 100 samples per row are made, if res > 0 the
1259  * function takes res sized steps within each row (res = 1 uses every
1260  * word).
1261  *
1262  * \param A Matrix
1263  * \param res Resolution of sampling (in words)
1264  */
1265 
1266 double mzd_density(mzd_t const *A, wi_t res);
1267 
1268 /**
1269  * \brief Return the number of nonzero entries divided by nrows *
1270  * ncols considering only the submatrix starting at (r,c).
1271  *
1272  * If res = 0 then 100 samples per row are made, if res > 0 the
1273  * function takes res sized steps within each row (res = 1 uses every
1274  * word).
1275  *
1276  * \param A Matrix
1277  * \param res Resolution of sampling (in words)
1278  * \param r Row to start counting
1279  * \param c Column to start counting
1280  */
1281 
1282 double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c);
1283 
1284 /**
1285  * \brief Return the first row with all zero entries.
1286  *
1287  * If no such row can be found returns nrows.
1288  *
1289  * \param A Matrix
1290  */
1291 
1292 rci_t mzd_first_zero_row(mzd_t const *A);
1293 
1294 /**
1295  * \brief Return hash value for matrix.
1296  *
1297  * \param A Matrix
1298  */
1299 
mzd_hash(mzd_t const * A)1300 static inline word mzd_hash(mzd_t const *A) {
1301   word hash = 0;
1302   for (rci_t r = 0; r < A->nrows; ++r) {
1303     hash ^= rotate_word(calculate_hash(A->rows[r], A->width), r % m4ri_radix);
1304   }
1305   return hash;
1306 }
1307 
1308 /**
1309  * Return upper triangular submatrix of A
1310  *
1311  * \param U Output matrix, if NULL a new matrix will be returned
1312  * \param A Source matrix
1313  *
1314  * \return U
1315  */
1316 
1317 mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A);
1318 
1319 /**
1320  * Return lower triangular submatrix of A
1321  *
1322  * \param L Output matrix, if NULL a new matrix will be returned
1323  * \param A Source matrix
1324  *
1325  * \return L
1326  */
1327 
1328 mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A);
1329 
1330 #endif  // M4RI_MZD
1331