1 #ifndef __BSR_H__
2 #define __BSR_H__
3 
4 #include <vector>
5 #include <algorithm>
6 #include <functional>
7 
8 #include "csr.h"
9 #include "dense.h"
10 
diagonal_size(const npy_intp k,const npy_intp rows,const npy_intp cols)11 static inline npy_intp diagonal_size(const npy_intp k,
12                                      const npy_intp rows,
13                                      const npy_intp cols)
14 {
15     return std::min(rows + std::min(k, (npy_intp)0),
16                     cols - std::max(k, (npy_intp)0));
17 }
18 
19 
20 template <class I, class T>
bsr_diagonal(const I k,const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],T Yx[])21 void bsr_diagonal(const I k,
22                   const I n_brow,
23                   const I n_bcol,
24                   const I R,
25                   const I C,
26                   const I Ap[],
27                   const I Aj[],
28                   const T Ax[],
29                         T Yx[])
30 {
31     const npy_intp RC = R * C;
32     const npy_intp D = diagonal_size(k, (npy_intp)n_brow * R,
33                                         (npy_intp)n_bcol * C);
34     const npy_intp first_row = (k >= 0) ? 0 : -k;
35     /* First and next-to-last brows of the diagonal. */
36     const npy_intp first_brow = first_row / R;
37     const npy_intp last_brow = (first_row + D - 1) / R + 1;
38 
39     for (npy_intp brow = first_brow; brow < last_brow; ++brow) {
40         /* First and next-to-last bcols of the diagonal in this brow. */
41         const npy_intp first_bcol = (brow * R + k) / C;
42         const npy_intp last_bcol = ((brow + 1) * R + k - 1) / C + 1;
43 
44         for (npy_intp jj = Ap[brow]; jj < Ap[brow + 1]; ++jj) {
45             const npy_intp bcol = Aj[jj];
46 
47             if (first_bcol <= bcol && bcol < last_bcol) {
48                 /*
49                  * Compute and extract diagonal of block corresponding to the
50                  * k-th overall diagonal and add it to output in right place.
51                  */
52                 const npy_intp block_k = brow * R + k - bcol * C;
53                 const npy_intp block_D = diagonal_size(block_k, R, C);
54                 const npy_intp block_first_row = (block_k >= 0) ? 0 : -block_k;
55                 const npy_intp Y_idx = brow * R + block_first_row - first_row;
56                 const npy_intp Ax_idx = RC * jj +
57                                         ((block_k >= 0) ? block_k :
58                                                           -C * block_k);
59 
60                 for (npy_intp kk = 0; kk < block_D; ++kk) {
61                     Yx[Y_idx + kk] += Ax[Ax_idx + kk * (C + 1)];
62                 }
63 
64             }
65         }
66     }
67 }
68 
69 /*
70  * Scale the rows of a BSR matrix *in place*
71  *
72  *   A[i,:] *= X[i]
73  *
74  */
75 template <class I, class T>
bsr_scale_rows(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],T Ax[],const T Xx[])76 void bsr_scale_rows(const I n_brow,
77                     const I n_bcol,
78                     const I R,
79                     const I C,
80                     const I Ap[],
81                     const I Aj[],
82                           T Ax[],
83                     const T Xx[])
84 {
85     const npy_intp RC = (npy_intp)R*C;
86 
87     for(I i = 0; i < n_brow; i++){
88         const T * row_scales = Xx + (npy_intp)R*i;
89 
90         for(I jj = Ap[i]; jj < Ap[i+1]; jj++){
91            T * block = Ax + RC*jj;
92 
93             for(I bi = 0; bi < R; bi++){
94                 scal(C, row_scales[bi], block + (npy_intp)C*bi);
95             }
96         }
97     }
98 }
99 
100 /*
101  * Scale the columns of a BSR matrix *in place*
102  *
103  *   A[:,i] *= X[i]
104  *
105  */
106 template <class I, class T>
bsr_scale_columns(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],T Ax[],const T Xx[])107 void bsr_scale_columns(const I n_brow,
108                        const I n_bcol,
109                        const I R,
110                        const I C,
111                        const I Ap[],
112                        const I Aj[],
113                              T Ax[],
114                        const T Xx[])
115 {
116     const I bnnz = Ap[n_brow];
117     const npy_intp RC  = (npy_intp)R*C;
118     for(I i = 0; i < bnnz; i++){
119         const T * scales = Xx + (npy_intp)C*Aj[i] ;
120         T * block = Ax + RC*i;
121 
122         for(I bi = 0; bi < R; bi++){
123             for(I bj = 0; bj < C; bj++){
124                 block[C*bi + bj] *= scales[bj];
125             }
126         }
127 
128     }
129 }
130 
131 
132 
133 /*
134  * Sort the column block indices of a BSR matrix inplace
135  *
136  * Input Arguments:
137  *   I  n_brow        - number of row blocks in A
138  *   I  n_bcol        - number of column blocks in A
139  *   I  R             - rows per block
140  *   I  C             - columns per block
141  *   I  Ap[n_brow+1]  - row pointer
142  *   I  Aj[nblk(A)]   - column indices
143  *   T  Ax[nnz(A)]    - nonzeros
144  *
145  */
146 template <class I, class T>
bsr_sort_indices(const I n_brow,const I n_bcol,const I R,const I C,I Ap[],I Aj[],T Ax[])147 void bsr_sort_indices(const I n_brow,
148                       const I n_bcol,
149                       const I R,
150                       const I C,
151                             I Ap[],
152                             I Aj[],
153                             T Ax[])
154 {
155     if( R == 1 && C == 1 ){
156         csr_sort_indices(n_brow, Ap, Aj, Ax);
157         return;
158     }
159 
160 
161     const I nblks = Ap[n_brow];
162     const npy_intp RC    = (npy_intp)R*C;
163     const npy_intp nnz   = (npy_intp)RC*nblks;
164 
165     //compute permutation of blocks using CSR
166     std::vector<I> perm(nblks);
167 
168     for(I i = 0; i < nblks; i++)
169         perm[i] = i;
170 
171     csr_sort_indices(n_brow, Ap, Aj, &perm[0]);
172 
173     std::vector<T> Ax_copy(nnz);
174     std::copy(Ax, Ax + nnz, Ax_copy.begin());
175 
176     for(I i = 0; i < nblks; i++){
177         const T * input = &Ax_copy[RC * perm[i]];
178               T * output = Ax + RC*i;
179         std::copy(input, input + RC, output);
180     }
181 }
182 
183 
184 /*
185  * Compute transpose(A) BSR matrix A
186  *
187  * Input Arguments:
188  *   I  n_brow        - number of row blocks in A
189  *   I  n_bcol        - number of column blocks in A
190  *   I  R             - rows per block
191  *   I  C             - columns per block
192  *   I  Ap[n_brow+1]  - row pointer
193  *   I  Aj[nblk(A)]   - column indices
194  *   T  Ax[nnz(A)]    - nonzeros
195  *
196  * Output Arguments:
197  *   I  Bp[n_col+1]   - row pointer
198  *   I  Bj[nblk(A)]   - column indices
199  *   T  Bx[nnz(A)]    - nonzeros
200  *
201  * Note:
202  *   Output arrays Bp, Bj, Bx must be preallocated
203  *
204  * Note:
205  *   Input:  column indices *are not* assumed to be in sorted order
206  *   Output: row indices *will be* in sorted order
207  *
208  *   Complexity: Linear.  Specifically O(nnz(A) + max(n_row,n_col))
209  *
210  */
211 template <class I, class T>
bsr_transpose(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],I Bp[],I Bj[],T Bx[])212 void bsr_transpose(const I n_brow,
213                    const I n_bcol,
214                    const I R,
215                    const I C,
216                    const I Ap[],
217                    const I Aj[],
218                    const T Ax[],
219                          I Bp[],
220                          I Bj[],
221                          T Bx[])
222 {
223     const I nblks = Ap[n_brow];
224     const npy_intp RC    = (npy_intp)R*C;
225 
226     //compute permutation of blocks using tranpose(CSR)
227     std::vector<I> perm_in (nblks);
228     std::vector<I> perm_out(nblks);
229 
230     for(I i = 0; i < nblks; i++)
231         perm_in[i] = i;
232 
233     csr_tocsc(n_brow, n_bcol, Ap, Aj, &perm_in[0], Bp, Bj, &perm_out[0]);
234 
235     for(I i = 0; i < nblks; i++){
236         const T * Ax_blk = Ax + RC * perm_out[i];
237               T * Bx_blk = Bx + RC * i;
238         for(I r = 0; r < R; r++){
239             for(I c = 0; c < C; c++){
240                 Bx_blk[(npy_intp)c * R + r] = Ax_blk[(npy_intp)r * C + c];
241             }
242         }
243     }
244 }
245 
246 
247 
248 template <class I, class T>
bsr_matmat(const I maxnnz,const I n_brow,const I n_bcol,const I R,const I C,const I N,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])249 void bsr_matmat(const I maxnnz,
250                 const I n_brow,  const I n_bcol,
251                 const I R,       const I C,       const I N,
252                 const I Ap[],    const I Aj[],    const T Ax[],
253                 const I Bp[],    const I Bj[],    const T Bx[],
254                       I Cp[],          I Cj[],          T Cx[])
255 {
256     assert(R > 0 && C > 0 && N > 0);
257 
258     if( R == 1 && N == 1 && C == 1 ){
259         // Use CSR for 1x1 blocksize
260         csr_matmat(n_brow, n_bcol, Ap, Aj, Ax, Bp, Bj, Bx, Cp, Cj, Cx);
261         return;
262     }
263 
264     const npy_intp RC = (npy_intp)R*C;
265     const npy_intp RN = (npy_intp)R*N;
266     const npy_intp NC = (npy_intp)N*C;
267 
268     std::fill( Cx, Cx + RC * maxnnz, 0 ); //clear output array
269 
270     std::vector<I>  next(n_bcol,-1);
271     std::vector<T*> mats(n_bcol);
272 
273     npy_intp nnz = 0;
274     Cp[0] = 0;
275 
276     for(I i = 0; i < n_brow; i++){
277         I head   = -2;
278         I length =  0;
279 
280         I jj_start = Ap[i];
281         I jj_end   = Ap[i+1];
282         for(I jj = jj_start; jj < jj_end; jj++){
283             I j = Aj[jj];
284 
285             I kk_start = Bp[j];
286             I kk_end   = Bp[j+1];
287             for(I kk = kk_start; kk < kk_end; kk++){
288                 I k = Bj[kk];
289 
290                 if(next[k] == -1){
291                     next[k] = head;
292                     head = k;
293                     Cj[nnz] = k;
294                     mats[k] = Cx + RC*nnz;
295                     nnz++;
296                     length++;
297                 }
298 
299                 const T * A = Ax + jj*RN;
300                 const T * B = Bx + kk*NC;
301 
302                 gemm(R, C, N, A, B, mats[k]);
303             }
304         }
305 
306         for(I jj = 0; jj < length; jj++){
307             I temp = head;
308             head = next[head];
309             next[temp] = -1; //clear arrays
310         }
311 
312         Cp[i+1] = nnz;
313     }
314 }
315 
316 
317 
318 
319 template <class I, class T>
is_nonzero_block(const T block[],const I blocksize)320 bool is_nonzero_block(const T block[], const I blocksize){
321     for(I i = 0; i < blocksize; i++){
322         if(block[i] != 0){
323             return true;
324         }
325     }
326     return false;
327 }
328 
329 
330 
331 /*
332  * Compute C = A (binary_op) B for BSR matrices that are not
333  * necessarily canonical BSR format.  Specifically, this method
334  * works even when the input matrices have duplicate and/or
335  * unsorted column indices within a given row.
336  *
337  * Refer to bsr_binop_bsr() for additional information
338  *
339  * Note:
340  *   Output arrays Cp, Cj, and Cx must be preallocated
341  *   If nnz(C) is not known a priori, a conservative bound is:
342  *          nnz(C) <= nnz(A) + nnz(B)
343  *
344  * Note:
345  *   Input:  A and B column indices are not assumed to be in sorted order
346  *   Output: C column indices are not generally in sorted order
347  *           C will not contain any duplicate entries or explicit zeros.
348  *
349  */
350 template <class I, class T, class T2, class bin_op>
bsr_binop_bsr_general(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[],const bin_op & op)351 void bsr_binop_bsr_general(const I n_brow, const I n_bcol,
352                            const I R,      const I C,
353                            const I Ap[],   const I Aj[],   const T Ax[],
354                            const I Bp[],   const I Bj[],   const T Bx[],
355                                  I Cp[],         I Cj[],        T2 Cx[],
356                            const bin_op& op)
357 {
358     //Method that works for duplicate and/or unsorted indices
359     const npy_intp RC = (npy_intp)R*C;
360 
361     Cp[0] = 0;
362     I nnz = 0;
363 
364     std::vector<I>  next(n_bcol,     -1);
365     std::vector<T> A_row(n_bcol * RC, 0);   // this approach can be problematic for large R
366     std::vector<T> B_row(n_bcol * RC, 0);
367 
368     for(I i = 0; i < n_brow; i++){
369         I head   = -2;
370         I length =  0;
371 
372         //add a row of A to A_row
373         for(I jj = Ap[i]; jj < Ap[i+1]; jj++){
374             I j = Aj[jj];
375 
376             for(I n = 0; n < RC; n++)
377                 A_row[RC*j + n] += Ax[RC*jj + n];
378 
379             if(next[j] == -1){
380                 next[j] = head;
381                 head = j;
382                 length++;
383             }
384         }
385 
386         //add a row of B to B_row
387         for(I jj = Bp[i]; jj < Bp[i+1]; jj++){
388             I j = Bj[jj];
389 
390             for(I n = 0; n < RC; n++)
391                 B_row[RC*j + n] += Bx[RC*jj + n];
392 
393             if(next[j] == -1){
394                 next[j] = head;
395                 head = j;
396                 length++;
397             }
398         }
399 
400 
401         for(I jj = 0; jj < length; jj++){
402             // compute op(block_A, block_B)
403             for(I n = 0; n < RC; n++)
404                 Cx[RC * nnz + n] = op(A_row[RC*head + n], B_row[RC*head + n]);
405 
406             // advance counter if block is nonzero
407             if( is_nonzero_block(Cx + (RC * nnz), RC) )
408                 Cj[nnz++] = head;
409 
410             // clear block_A and block_B values
411             for(I n = 0; n < RC; n++){
412                 A_row[RC*head + n] = 0;
413                 B_row[RC*head + n] = 0;
414             }
415 
416             I temp = head;
417             head = next[head];
418             next[temp] = -1;
419         }
420 
421         Cp[i + 1] = nnz;
422     }
423 }
424 
425 
426 /*
427  * Compute C = A (binary_op) B for BSR matrices that are in the
428  * canonical BSR format.  Specifically, this method requires that
429  * the rows of the input matrices are free of duplicate column indices
430  * and that the column indices are in sorted order.
431  *
432  * Refer to bsr_binop_bsr() for additional information
433  *
434  * Note:
435  *   Input:  A and B column indices are assumed to be in sorted order
436  *   Output: C column indices will be in sorted order
437  *           Cx will not contain any zero entries
438  *
439  */
440 template <class I, class T, class T2, class bin_op>
bsr_binop_bsr_canonical(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[],const bin_op & op)441 void bsr_binop_bsr_canonical(const I n_brow, const I n_bcol,
442                              const I R,      const I C,
443                              const I Ap[],   const I Aj[],   const T Ax[],
444                              const I Bp[],   const I Bj[],   const T Bx[],
445                                    I Cp[],         I Cj[],        T2 Cx[],
446                              const bin_op& op)
447 {
448     const npy_intp RC = (npy_intp)R*C;
449     T2 * result = Cx;
450 
451     Cp[0] = 0;
452     I nnz = 0;
453 
454     for(I i = 0; i < n_brow; i++){
455         I A_pos = Ap[i];
456         I B_pos = Bp[i];
457         I A_end = Ap[i+1];
458         I B_end = Bp[i+1];
459 
460         //while not finished with either row
461         while(A_pos < A_end && B_pos < B_end){
462             I A_j = Aj[A_pos];
463             I B_j = Bj[B_pos];
464 
465             if(A_j == B_j){
466                 for(I n = 0; n < RC; n++){
467                     result[n] = op(Ax[RC*A_pos + n], Bx[RC*B_pos + n]);
468                 }
469 
470                 if( is_nonzero_block(result,RC) ){
471                     Cj[nnz] = A_j;
472                     result += RC;
473                     nnz++;
474                 }
475 
476                 A_pos++;
477                 B_pos++;
478             } else if (A_j < B_j) {
479                 for(I n = 0; n < RC; n++){
480                     result[n] = op(Ax[RC*A_pos + n], 0);
481                 }
482 
483                 if(is_nonzero_block(result,RC)){
484                     Cj[nnz] = A_j;
485                     result += RC;
486                     nnz++;
487                 }
488 
489                 A_pos++;
490             } else {
491                 //B_j < A_j
492                 for(I n = 0; n < RC; n++){
493                     result[n] = op(0, Bx[RC*B_pos + n]);
494                 }
495                 if(is_nonzero_block(result,RC)){
496                     Cj[nnz] = B_j;
497                     result += RC;
498                     nnz++;
499                 }
500 
501                 B_pos++;
502             }
503         }
504 
505         //tail
506         while(A_pos < A_end){
507             for(I n = 0; n < RC; n++){
508                 result[n] = op(Ax[RC*A_pos + n], 0);
509             }
510 
511             if(is_nonzero_block(result, RC)){
512                 Cj[nnz] = Aj[A_pos];
513                 result += RC;
514                 nnz++;
515             }
516 
517             A_pos++;
518         }
519         while(B_pos < B_end){
520             for(I n = 0; n < RC; n++){
521                 result[n] = op(0,Bx[RC*B_pos + n]);
522             }
523 
524             if(is_nonzero_block(result, RC)){
525                 Cj[nnz] = Bj[B_pos];
526                 result += RC;
527                 nnz++;
528             }
529 
530             B_pos++;
531         }
532 
533         Cp[i+1] = nnz;
534     }
535 }
536 
537 
538 /*
539  * Compute C = A (binary_op) B for CSR matrices A,B where the column
540  * indices with the rows of A and B are known to be sorted.
541  *
542  *   binary_op(x,y) - binary operator to apply elementwise
543  *
544  * Input Arguments:
545  *   I    n_row       - number of rows in A (and B)
546  *   I    n_col       - number of columns in A (and B)
547  *   I    Ap[n_row+1] - row pointer
548  *   I    Aj[nnz(A)]  - column indices
549  *   T    Ax[nnz(A)]  - nonzeros
550  *   I    Bp[n_row+1] - row pointer
551  *   I    Bj[nnz(B)]  - column indices
552  *   T    Bx[nnz(B)]  - nonzeros
553  * Output Arguments:
554  *   I    Cp[n_row+1] - row pointer
555  *   I    Cj[nnz(C)]  - column indices
556  *   T    Cx[nnz(C)]  - nonzeros
557  *
558  * Note:
559  *   Output arrays Cp, Cj, and Cx must be preallocated
560  *   If nnz(C) is not known a priori, a conservative bound is:
561  *          nnz(C) <= nnz(A) + nnz(B)
562  *
563  * Note:
564  *   Input:  A and B column indices are not assumed to be in sorted order.
565  *   Output: C column indices will be in sorted if both A and B have sorted indices.
566  *           Cx will not contain any zero entries
567  *
568  */
569 template <class I, class T, class T2, class bin_op>
bsr_binop_bsr(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[],const bin_op & op)570 void bsr_binop_bsr(const I n_brow, const I n_bcol,
571                    const I R,      const I C,
572                    const I Ap[],   const I Aj[],   const T Ax[],
573                    const I Bp[],   const I Bj[],   const T Bx[],
574                          I Cp[],         I Cj[],        T2 Cx[],
575                    const bin_op& op)
576 {
577     assert( R > 0 && C > 0);
578 
579     if( R == 1 && C == 1 ){
580         //use CSR for 1x1 blocksize
581         csr_binop_csr(n_brow, n_bcol, Ap, Aj, Ax, Bp, Bj, Bx, Cp, Cj, Cx, op);
582     }
583     else if ( csr_has_canonical_format(n_brow, Ap, Aj) && csr_has_canonical_format(n_brow, Bp, Bj) ){
584         // prefer faster implementation
585         bsr_binop_bsr_canonical(n_brow, n_bcol, R, C, Ap, Aj, Ax, Bp, Bj, Bx, Cp, Cj, Cx, op);
586     }
587     else {
588         // slower fallback method
589         bsr_binop_bsr_general(n_brow, n_bcol, R, C, Ap, Aj, Ax, Bp, Bj, Bx, Cp, Cj, Cx, op);
590     }
591 }
592 
593 /* element-wise binary operations */
594 template <class I, class T, class T2>
bsr_ne_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])595 void bsr_ne_bsr(const I n_row, const I n_col, const I R, const I C,
596                 const I Ap[], const I Aj[], const T Ax[],
597                 const I Bp[], const I Bj[], const T Bx[],
598                       I Cp[],       I Cj[],      T2 Cx[])
599 {
600     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::not_equal_to<T>());
601 }
602 
603 template <class I, class T, class T2>
bsr_lt_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])604 void bsr_lt_bsr(const I n_row, const I n_col, const I R, const I C,
605                 const I Ap[], const I Aj[], const T Ax[],
606                 const I Bp[], const I Bj[], const T Bx[],
607                       I Cp[],       I Cj[],      T2 Cx[])
608 {
609     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::less<T>());
610 }
611 
612 template <class I, class T, class T2>
bsr_gt_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])613 void bsr_gt_bsr(const I n_row, const I n_col, const I R, const I C,
614                 const I Ap[], const I Aj[], const T Ax[],
615                 const I Bp[], const I Bj[], const T Bx[],
616                       I Cp[],       I Cj[],      T2 Cx[])
617 {
618     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::greater<T>());
619 }
620 
621 template <class I, class T, class T2>
bsr_le_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])622 void bsr_le_bsr(const I n_row, const I n_col, const I R, const I C,
623                 const I Ap[], const I Aj[], const T Ax[],
624                 const I Bp[], const I Bj[], const T Bx[],
625                       I Cp[],       I Cj[],      T2 Cx[])
626 {
627     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::less_equal<T>());
628 }
629 
630 template <class I, class T, class T2>
bsr_ge_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])631 void bsr_ge_bsr(const I n_row, const I n_col, const I R, const I C,
632                 const I Ap[], const I Aj[], const T Ax[],
633                 const I Bp[], const I Bj[], const T Bx[],
634                       I Cp[],       I Cj[],      T2 Cx[])
635 {
636     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::greater_equal<T>());
637 }
638 
639 template <class I, class T>
bsr_elmul_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])640 void bsr_elmul_bsr(const I n_row, const I n_col, const I R, const I C,
641                    const I Ap[], const I Aj[], const T Ax[],
642                    const I Bp[], const I Bj[], const T Bx[],
643                          I Cp[],       I Cj[],       T Cx[])
644 {
645     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::multiplies<T>());
646 }
647 
648 template <class I, class T>
bsr_eldiv_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])649 void bsr_eldiv_bsr(const I n_row, const I n_col, const I R, const I C,
650                    const I Ap[], const I Aj[], const T Ax[],
651                    const I Bp[], const I Bj[], const T Bx[],
652                          I Cp[],       I Cj[],       T Cx[])
653 {
654     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::divides<T>());
655 }
656 
657 
658 template <class I, class T>
bsr_plus_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])659 void bsr_plus_bsr(const I n_row, const I n_col, const I R, const I C,
660                   const I Ap[], const I Aj[], const T Ax[],
661                   const I Bp[], const I Bj[], const T Bx[],
662                         I Cp[],       I Cj[],       T Cx[])
663 {
664     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::plus<T>());
665 }
666 
667 template <class I, class T>
bsr_minus_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])668 void bsr_minus_bsr(const I n_row, const I n_col, const I R, const I C,
669                    const I Ap[], const I Aj[], const T Ax[],
670                    const I Bp[], const I Bj[], const T Bx[],
671                          I Cp[],       I Cj[],       T Cx[])
672 {
673     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::minus<T>());
674 }
675 
676 
677 template <class I, class T>
bsr_maximum_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])678 void bsr_maximum_bsr(const I n_row, const I n_col, const I R, const I C,
679                      const I Ap[], const I Aj[], const T Ax[],
680                      const I Bp[], const I Bj[], const T Bx[],
681                            I Cp[],       I Cj[],       T Cx[])
682 {
683     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,maximum<T>());
684 }
685 
686 template <class I, class T>
bsr_minimum_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])687 void bsr_minimum_bsr(const I n_row, const I n_col, const I R, const I C,
688                      const I Ap[], const I Aj[], const T Ax[],
689                      const I Bp[], const I Bj[], const T Bx[],
690                            I Cp[],       I Cj[],       T Cx[])
691 {
692     bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,minimum<T>());
693 }
694 
695 
696 /*
697  * Compute B = A for BSR matrix A, CSR matrix B
698  *
699  * Input Arguments:
700  *   I  n_brow          - number of block rows in A
701  *   I  n_bcol          - number of block columns in A
702  *   I  R               - row blocksize
703  *   I  C               - column blocksize
704  *   I  Ap[n_brow+1]    - block row pointer
705  *   I  Aj[nnz(A)]      - block column indices
706  *   T  Ax[nnz(A)]      - nonzero blocks
707  *
708  * Output Arguments:
709  *   I  Bp[n_brow*R + 1]- row pointer
710  *   I  Bj[nnz(B)]      - column indices
711  *   T  Bx[nnz(B)]      - nonzero values
712  *
713  * Note:
714  *   Complexity: Linear. Specifically O(nnz(A) + max(n_row,n_col))
715  *   Output arrays must be preallocated
716  *
717  * Note:
718  *   Input:  column indices *are not* assumed to be in sorted order or unique
719  *   Output: the block column (unsorted) orders, duplicates,
720  *           and explicit zeros are preserved
721  *
722  */
723 template <class I, class T>
bsr_tocsr(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],I Bp[],I Bj[],T Bx[])724 void bsr_tocsr(const I n_brow, const I n_bcol, const I R, const I C,
725                const I Ap[], const I Aj[], const T Ax[],
726                      I Bp[],       I Bj[],       T Bx[])
727 {
728     // number of elements per block
729     const I RC = R*C;
730     // nnz
731     const I nnz = Ap[n_brow] * RC;
732     // last element in Bp is always nnz
733     Bp[n_brow * R] = nnz;
734     // loop for block row
735     for(I brow = 0; brow < n_brow; brow++){
736         // size of block rows
737         const I brow_size = Ap[brow + 1] - Ap[brow];
738         // size of row in csr
739         const I row_size = C * brow_size;
740         // loop of rows inside block
741         for(I r = 0; r < R; r++){
742             // csr row number
743             const I row = R * brow + r;
744             Bp[row] = RC * Ap[brow] + r * row_size;
745             // loop for block column
746             // block index inside row as loop variable
747             for (I bjj = 0; bjj < brow_size; bjj++)
748             {
749                 const I b_ind = Ap[brow] + bjj;
750                 // block column number
751                 const I bcol = Aj[b_ind];
752                 // loop for columns inside block
753                 for (I c = 0; c < C; c++)
754                 {
755                     // bsr data index in Ax
756                     // Ax is in C order
757                     const I b_data_ind = RC * b_ind + C * r + c;
758                     // csr column number
759                     const I col = C * bcol + c;
760                     // csr data anc col index in Bj and Bx
761                     // start from Bp[row], offset by current bjj*C and c
762                     const I data_ind = Bp[row] + bjj * C + c;
763                     // assign col and data to Bj and Bx
764                     Bj[data_ind] = col;
765                     Bx[data_ind] = Ax[b_data_ind];
766                 }
767             }
768         }
769     }
770 }
771 
772 
773 template <class I, class T>
bsr_matvec(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const T Xx[],T Yx[])774 void bsr_matvec(const I n_brow,
775                 const I n_bcol,
776                 const I R,
777                 const I C,
778                 const I Ap[],
779                 const I Aj[],
780                 const T Ax[],
781                 const T Xx[],
782                       T Yx[])
783 {
784     assert(R > 0 && C > 0);
785 
786     if( R == 1 && C == 1 ){
787         //use CSR for 1x1 blocksize
788         csr_matvec(n_brow, n_bcol, Ap, Aj, Ax, Xx, Yx);
789         return;
790     }
791 
792     const npy_intp RC = (npy_intp)R*C;
793     for(I i = 0; i < n_brow; i++){
794         T * y = Yx + (npy_intp)R * i;
795         for(I jj = Ap[i]; jj < Ap[i+1]; jj++){
796             const I j = Aj[jj];
797             const T * A = Ax + RC * jj;
798             const T * x = Xx + (npy_intp)C * j;
799             gemv(R, C, A, x, y); // y += A*x
800         }
801     }
802 }
803 
804 
805 /*
806  * Compute Y += A*X for BSR matrix A and dense block vectors X,Y
807  *
808  *
809  * Input Arguments:
810  *   I  n_brow              - number of row blocks in A
811  *   I  n_bcol              - number of column blocks in A
812  *   I  n_vecs              - number of column vectors in X and Y
813  *   I  R                   - rows per block
814  *   I  C                   - columns per block
815  *   I  Ap[n_brow+1]        - row pointer
816  *   I  Aj[nblks(A)]        - column indices
817  *   T  Ax[nnz(A)]          - nonzeros
818  *   T  Xx[C*n_bcol,n_vecs] - input vector
819  *
820  * Output Arguments:
821  *   T  Yx[R*n_brow,n_vecs] - output vector
822  *
823  */
824 template <class I, class T>
bsr_matvecs(const I n_brow,const I n_bcol,const I n_vecs,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const T Xx[],T Yx[])825 void bsr_matvecs(const I n_brow,
826                  const I n_bcol,
827                  const I n_vecs,
828                  const I R,
829                  const I C,
830                  const I Ap[],
831                  const I Aj[],
832                  const T Ax[],
833                  const T Xx[],
834                        T Yx[])
835 {
836     assert(R > 0 && C > 0);
837 
838     if( R == 1 && C == 1 ){
839         //use CSR for 1x1 blocksize
840         csr_matvecs(n_brow, n_bcol, n_vecs, Ap, Aj, Ax, Xx, Yx);
841         return;
842     }
843 
844     const npy_intp A_bs = (npy_intp)R*C;      //Ax blocksize
845     const npy_intp Y_bs = (npy_intp)n_vecs*R; //Yx blocksize
846     const npy_intp X_bs = (npy_intp)C*n_vecs; //Xx blocksize
847 
848     for(I i = 0; i < n_brow; i++){
849         T * y = Yx + Y_bs * i;
850         for(I jj = Ap[i]; jj < Ap[i+1]; jj++){
851             const I j = Aj[jj];
852             const T * A = Ax + A_bs * jj;
853             const T * x = Xx + X_bs * j;
854             gemm(R, n_vecs, C, A, x, y); // y += A*x
855         }
856     }
857 }
858 
859 
860 #endif
861