1 #ifndef __BSR_H__
2 #define __BSR_H__
3
4 #include <vector>
5 #include <algorithm>
6 #include <functional>
7
8 #include "csr.h"
9 #include "dense.h"
10
diagonal_size(const npy_intp k,const npy_intp rows,const npy_intp cols)11 static inline npy_intp diagonal_size(const npy_intp k,
12 const npy_intp rows,
13 const npy_intp cols)
14 {
15 return std::min(rows + std::min(k, (npy_intp)0),
16 cols - std::max(k, (npy_intp)0));
17 }
18
19
20 template <class I, class T>
bsr_diagonal(const I k,const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],T Yx[])21 void bsr_diagonal(const I k,
22 const I n_brow,
23 const I n_bcol,
24 const I R,
25 const I C,
26 const I Ap[],
27 const I Aj[],
28 const T Ax[],
29 T Yx[])
30 {
31 const npy_intp RC = R * C;
32 const npy_intp D = diagonal_size(k, (npy_intp)n_brow * R,
33 (npy_intp)n_bcol * C);
34 const npy_intp first_row = (k >= 0) ? 0 : -k;
35 /* First and next-to-last brows of the diagonal. */
36 const npy_intp first_brow = first_row / R;
37 const npy_intp last_brow = (first_row + D - 1) / R + 1;
38
39 for (npy_intp brow = first_brow; brow < last_brow; ++brow) {
40 /* First and next-to-last bcols of the diagonal in this brow. */
41 const npy_intp first_bcol = (brow * R + k) / C;
42 const npy_intp last_bcol = ((brow + 1) * R + k - 1) / C + 1;
43
44 for (npy_intp jj = Ap[brow]; jj < Ap[brow + 1]; ++jj) {
45 const npy_intp bcol = Aj[jj];
46
47 if (first_bcol <= bcol && bcol < last_bcol) {
48 /*
49 * Compute and extract diagonal of block corresponding to the
50 * k-th overall diagonal and add it to output in right place.
51 */
52 const npy_intp block_k = brow * R + k - bcol * C;
53 const npy_intp block_D = diagonal_size(block_k, R, C);
54 const npy_intp block_first_row = (block_k >= 0) ? 0 : -block_k;
55 const npy_intp Y_idx = brow * R + block_first_row - first_row;
56 const npy_intp Ax_idx = RC * jj +
57 ((block_k >= 0) ? block_k :
58 -C * block_k);
59
60 for (npy_intp kk = 0; kk < block_D; ++kk) {
61 Yx[Y_idx + kk] += Ax[Ax_idx + kk * (C + 1)];
62 }
63
64 }
65 }
66 }
67 }
68
69 /*
70 * Scale the rows of a BSR matrix *in place*
71 *
72 * A[i,:] *= X[i]
73 *
74 */
75 template <class I, class T>
bsr_scale_rows(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],T Ax[],const T Xx[])76 void bsr_scale_rows(const I n_brow,
77 const I n_bcol,
78 const I R,
79 const I C,
80 const I Ap[],
81 const I Aj[],
82 T Ax[],
83 const T Xx[])
84 {
85 const npy_intp RC = (npy_intp)R*C;
86
87 for(I i = 0; i < n_brow; i++){
88 const T * row_scales = Xx + (npy_intp)R*i;
89
90 for(I jj = Ap[i]; jj < Ap[i+1]; jj++){
91 T * block = Ax + RC*jj;
92
93 for(I bi = 0; bi < R; bi++){
94 scal(C, row_scales[bi], block + (npy_intp)C*bi);
95 }
96 }
97 }
98 }
99
100 /*
101 * Scale the columns of a BSR matrix *in place*
102 *
103 * A[:,i] *= X[i]
104 *
105 */
106 template <class I, class T>
bsr_scale_columns(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],T Ax[],const T Xx[])107 void bsr_scale_columns(const I n_brow,
108 const I n_bcol,
109 const I R,
110 const I C,
111 const I Ap[],
112 const I Aj[],
113 T Ax[],
114 const T Xx[])
115 {
116 const I bnnz = Ap[n_brow];
117 const npy_intp RC = (npy_intp)R*C;
118 for(I i = 0; i < bnnz; i++){
119 const T * scales = Xx + (npy_intp)C*Aj[i] ;
120 T * block = Ax + RC*i;
121
122 for(I bi = 0; bi < R; bi++){
123 for(I bj = 0; bj < C; bj++){
124 block[C*bi + bj] *= scales[bj];
125 }
126 }
127
128 }
129 }
130
131
132
133 /*
134 * Sort the column block indices of a BSR matrix inplace
135 *
136 * Input Arguments:
137 * I n_brow - number of row blocks in A
138 * I n_bcol - number of column blocks in A
139 * I R - rows per block
140 * I C - columns per block
141 * I Ap[n_brow+1] - row pointer
142 * I Aj[nblk(A)] - column indices
143 * T Ax[nnz(A)] - nonzeros
144 *
145 */
146 template <class I, class T>
bsr_sort_indices(const I n_brow,const I n_bcol,const I R,const I C,I Ap[],I Aj[],T Ax[])147 void bsr_sort_indices(const I n_brow,
148 const I n_bcol,
149 const I R,
150 const I C,
151 I Ap[],
152 I Aj[],
153 T Ax[])
154 {
155 if( R == 1 && C == 1 ){
156 csr_sort_indices(n_brow, Ap, Aj, Ax);
157 return;
158 }
159
160
161 const I nblks = Ap[n_brow];
162 const npy_intp RC = (npy_intp)R*C;
163 const npy_intp nnz = (npy_intp)RC*nblks;
164
165 //compute permutation of blocks using CSR
166 std::vector<I> perm(nblks);
167
168 for(I i = 0; i < nblks; i++)
169 perm[i] = i;
170
171 csr_sort_indices(n_brow, Ap, Aj, &perm[0]);
172
173 std::vector<T> Ax_copy(nnz);
174 std::copy(Ax, Ax + nnz, Ax_copy.begin());
175
176 for(I i = 0; i < nblks; i++){
177 const T * input = &Ax_copy[RC * perm[i]];
178 T * output = Ax + RC*i;
179 std::copy(input, input + RC, output);
180 }
181 }
182
183
184 /*
185 * Compute transpose(A) BSR matrix A
186 *
187 * Input Arguments:
188 * I n_brow - number of row blocks in A
189 * I n_bcol - number of column blocks in A
190 * I R - rows per block
191 * I C - columns per block
192 * I Ap[n_brow+1] - row pointer
193 * I Aj[nblk(A)] - column indices
194 * T Ax[nnz(A)] - nonzeros
195 *
196 * Output Arguments:
197 * I Bp[n_col+1] - row pointer
198 * I Bj[nblk(A)] - column indices
199 * T Bx[nnz(A)] - nonzeros
200 *
201 * Note:
202 * Output arrays Bp, Bj, Bx must be preallocated
203 *
204 * Note:
205 * Input: column indices *are not* assumed to be in sorted order
206 * Output: row indices *will be* in sorted order
207 *
208 * Complexity: Linear. Specifically O(nnz(A) + max(n_row,n_col))
209 *
210 */
211 template <class I, class T>
bsr_transpose(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],I Bp[],I Bj[],T Bx[])212 void bsr_transpose(const I n_brow,
213 const I n_bcol,
214 const I R,
215 const I C,
216 const I Ap[],
217 const I Aj[],
218 const T Ax[],
219 I Bp[],
220 I Bj[],
221 T Bx[])
222 {
223 const I nblks = Ap[n_brow];
224 const npy_intp RC = (npy_intp)R*C;
225
226 //compute permutation of blocks using tranpose(CSR)
227 std::vector<I> perm_in (nblks);
228 std::vector<I> perm_out(nblks);
229
230 for(I i = 0; i < nblks; i++)
231 perm_in[i] = i;
232
233 csr_tocsc(n_brow, n_bcol, Ap, Aj, &perm_in[0], Bp, Bj, &perm_out[0]);
234
235 for(I i = 0; i < nblks; i++){
236 const T * Ax_blk = Ax + RC * perm_out[i];
237 T * Bx_blk = Bx + RC * i;
238 for(I r = 0; r < R; r++){
239 for(I c = 0; c < C; c++){
240 Bx_blk[(npy_intp)c * R + r] = Ax_blk[(npy_intp)r * C + c];
241 }
242 }
243 }
244 }
245
246
247
248 template <class I, class T>
bsr_matmat(const I maxnnz,const I n_brow,const I n_bcol,const I R,const I C,const I N,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])249 void bsr_matmat(const I maxnnz,
250 const I n_brow, const I n_bcol,
251 const I R, const I C, const I N,
252 const I Ap[], const I Aj[], const T Ax[],
253 const I Bp[], const I Bj[], const T Bx[],
254 I Cp[], I Cj[], T Cx[])
255 {
256 assert(R > 0 && C > 0 && N > 0);
257
258 if( R == 1 && N == 1 && C == 1 ){
259 // Use CSR for 1x1 blocksize
260 csr_matmat(n_brow, n_bcol, Ap, Aj, Ax, Bp, Bj, Bx, Cp, Cj, Cx);
261 return;
262 }
263
264 const npy_intp RC = (npy_intp)R*C;
265 const npy_intp RN = (npy_intp)R*N;
266 const npy_intp NC = (npy_intp)N*C;
267
268 std::fill( Cx, Cx + RC * maxnnz, 0 ); //clear output array
269
270 std::vector<I> next(n_bcol,-1);
271 std::vector<T*> mats(n_bcol);
272
273 npy_intp nnz = 0;
274 Cp[0] = 0;
275
276 for(I i = 0; i < n_brow; i++){
277 I head = -2;
278 I length = 0;
279
280 I jj_start = Ap[i];
281 I jj_end = Ap[i+1];
282 for(I jj = jj_start; jj < jj_end; jj++){
283 I j = Aj[jj];
284
285 I kk_start = Bp[j];
286 I kk_end = Bp[j+1];
287 for(I kk = kk_start; kk < kk_end; kk++){
288 I k = Bj[kk];
289
290 if(next[k] == -1){
291 next[k] = head;
292 head = k;
293 Cj[nnz] = k;
294 mats[k] = Cx + RC*nnz;
295 nnz++;
296 length++;
297 }
298
299 const T * A = Ax + jj*RN;
300 const T * B = Bx + kk*NC;
301
302 gemm(R, C, N, A, B, mats[k]);
303 }
304 }
305
306 for(I jj = 0; jj < length; jj++){
307 I temp = head;
308 head = next[head];
309 next[temp] = -1; //clear arrays
310 }
311
312 Cp[i+1] = nnz;
313 }
314 }
315
316
317
318
319 template <class I, class T>
is_nonzero_block(const T block[],const I blocksize)320 bool is_nonzero_block(const T block[], const I blocksize){
321 for(I i = 0; i < blocksize; i++){
322 if(block[i] != 0){
323 return true;
324 }
325 }
326 return false;
327 }
328
329
330
331 /*
332 * Compute C = A (binary_op) B for BSR matrices that are not
333 * necessarily canonical BSR format. Specifically, this method
334 * works even when the input matrices have duplicate and/or
335 * unsorted column indices within a given row.
336 *
337 * Refer to bsr_binop_bsr() for additional information
338 *
339 * Note:
340 * Output arrays Cp, Cj, and Cx must be preallocated
341 * If nnz(C) is not known a priori, a conservative bound is:
342 * nnz(C) <= nnz(A) + nnz(B)
343 *
344 * Note:
345 * Input: A and B column indices are not assumed to be in sorted order
346 * Output: C column indices are not generally in sorted order
347 * C will not contain any duplicate entries or explicit zeros.
348 *
349 */
350 template <class I, class T, class T2, class bin_op>
bsr_binop_bsr_general(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[],const bin_op & op)351 void bsr_binop_bsr_general(const I n_brow, const I n_bcol,
352 const I R, const I C,
353 const I Ap[], const I Aj[], const T Ax[],
354 const I Bp[], const I Bj[], const T Bx[],
355 I Cp[], I Cj[], T2 Cx[],
356 const bin_op& op)
357 {
358 //Method that works for duplicate and/or unsorted indices
359 const npy_intp RC = (npy_intp)R*C;
360
361 Cp[0] = 0;
362 I nnz = 0;
363
364 std::vector<I> next(n_bcol, -1);
365 std::vector<T> A_row(n_bcol * RC, 0); // this approach can be problematic for large R
366 std::vector<T> B_row(n_bcol * RC, 0);
367
368 for(I i = 0; i < n_brow; i++){
369 I head = -2;
370 I length = 0;
371
372 //add a row of A to A_row
373 for(I jj = Ap[i]; jj < Ap[i+1]; jj++){
374 I j = Aj[jj];
375
376 for(I n = 0; n < RC; n++)
377 A_row[RC*j + n] += Ax[RC*jj + n];
378
379 if(next[j] == -1){
380 next[j] = head;
381 head = j;
382 length++;
383 }
384 }
385
386 //add a row of B to B_row
387 for(I jj = Bp[i]; jj < Bp[i+1]; jj++){
388 I j = Bj[jj];
389
390 for(I n = 0; n < RC; n++)
391 B_row[RC*j + n] += Bx[RC*jj + n];
392
393 if(next[j] == -1){
394 next[j] = head;
395 head = j;
396 length++;
397 }
398 }
399
400
401 for(I jj = 0; jj < length; jj++){
402 // compute op(block_A, block_B)
403 for(I n = 0; n < RC; n++)
404 Cx[RC * nnz + n] = op(A_row[RC*head + n], B_row[RC*head + n]);
405
406 // advance counter if block is nonzero
407 if( is_nonzero_block(Cx + (RC * nnz), RC) )
408 Cj[nnz++] = head;
409
410 // clear block_A and block_B values
411 for(I n = 0; n < RC; n++){
412 A_row[RC*head + n] = 0;
413 B_row[RC*head + n] = 0;
414 }
415
416 I temp = head;
417 head = next[head];
418 next[temp] = -1;
419 }
420
421 Cp[i + 1] = nnz;
422 }
423 }
424
425
426 /*
427 * Compute C = A (binary_op) B for BSR matrices that are in the
428 * canonical BSR format. Specifically, this method requires that
429 * the rows of the input matrices are free of duplicate column indices
430 * and that the column indices are in sorted order.
431 *
432 * Refer to bsr_binop_bsr() for additional information
433 *
434 * Note:
435 * Input: A and B column indices are assumed to be in sorted order
436 * Output: C column indices will be in sorted order
437 * Cx will not contain any zero entries
438 *
439 */
440 template <class I, class T, class T2, class bin_op>
bsr_binop_bsr_canonical(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[],const bin_op & op)441 void bsr_binop_bsr_canonical(const I n_brow, const I n_bcol,
442 const I R, const I C,
443 const I Ap[], const I Aj[], const T Ax[],
444 const I Bp[], const I Bj[], const T Bx[],
445 I Cp[], I Cj[], T2 Cx[],
446 const bin_op& op)
447 {
448 const npy_intp RC = (npy_intp)R*C;
449 T2 * result = Cx;
450
451 Cp[0] = 0;
452 I nnz = 0;
453
454 for(I i = 0; i < n_brow; i++){
455 I A_pos = Ap[i];
456 I B_pos = Bp[i];
457 I A_end = Ap[i+1];
458 I B_end = Bp[i+1];
459
460 //while not finished with either row
461 while(A_pos < A_end && B_pos < B_end){
462 I A_j = Aj[A_pos];
463 I B_j = Bj[B_pos];
464
465 if(A_j == B_j){
466 for(I n = 0; n < RC; n++){
467 result[n] = op(Ax[RC*A_pos + n], Bx[RC*B_pos + n]);
468 }
469
470 if( is_nonzero_block(result,RC) ){
471 Cj[nnz] = A_j;
472 result += RC;
473 nnz++;
474 }
475
476 A_pos++;
477 B_pos++;
478 } else if (A_j < B_j) {
479 for(I n = 0; n < RC; n++){
480 result[n] = op(Ax[RC*A_pos + n], 0);
481 }
482
483 if(is_nonzero_block(result,RC)){
484 Cj[nnz] = A_j;
485 result += RC;
486 nnz++;
487 }
488
489 A_pos++;
490 } else {
491 //B_j < A_j
492 for(I n = 0; n < RC; n++){
493 result[n] = op(0, Bx[RC*B_pos + n]);
494 }
495 if(is_nonzero_block(result,RC)){
496 Cj[nnz] = B_j;
497 result += RC;
498 nnz++;
499 }
500
501 B_pos++;
502 }
503 }
504
505 //tail
506 while(A_pos < A_end){
507 for(I n = 0; n < RC; n++){
508 result[n] = op(Ax[RC*A_pos + n], 0);
509 }
510
511 if(is_nonzero_block(result, RC)){
512 Cj[nnz] = Aj[A_pos];
513 result += RC;
514 nnz++;
515 }
516
517 A_pos++;
518 }
519 while(B_pos < B_end){
520 for(I n = 0; n < RC; n++){
521 result[n] = op(0,Bx[RC*B_pos + n]);
522 }
523
524 if(is_nonzero_block(result, RC)){
525 Cj[nnz] = Bj[B_pos];
526 result += RC;
527 nnz++;
528 }
529
530 B_pos++;
531 }
532
533 Cp[i+1] = nnz;
534 }
535 }
536
537
538 /*
539 * Compute C = A (binary_op) B for CSR matrices A,B where the column
540 * indices with the rows of A and B are known to be sorted.
541 *
542 * binary_op(x,y) - binary operator to apply elementwise
543 *
544 * Input Arguments:
545 * I n_row - number of rows in A (and B)
546 * I n_col - number of columns in A (and B)
547 * I Ap[n_row+1] - row pointer
548 * I Aj[nnz(A)] - column indices
549 * T Ax[nnz(A)] - nonzeros
550 * I Bp[n_row+1] - row pointer
551 * I Bj[nnz(B)] - column indices
552 * T Bx[nnz(B)] - nonzeros
553 * Output Arguments:
554 * I Cp[n_row+1] - row pointer
555 * I Cj[nnz(C)] - column indices
556 * T Cx[nnz(C)] - nonzeros
557 *
558 * Note:
559 * Output arrays Cp, Cj, and Cx must be preallocated
560 * If nnz(C) is not known a priori, a conservative bound is:
561 * nnz(C) <= nnz(A) + nnz(B)
562 *
563 * Note:
564 * Input: A and B column indices are not assumed to be in sorted order.
565 * Output: C column indices will be in sorted if both A and B have sorted indices.
566 * Cx will not contain any zero entries
567 *
568 */
569 template <class I, class T, class T2, class bin_op>
bsr_binop_bsr(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[],const bin_op & op)570 void bsr_binop_bsr(const I n_brow, const I n_bcol,
571 const I R, const I C,
572 const I Ap[], const I Aj[], const T Ax[],
573 const I Bp[], const I Bj[], const T Bx[],
574 I Cp[], I Cj[], T2 Cx[],
575 const bin_op& op)
576 {
577 assert( R > 0 && C > 0);
578
579 if( R == 1 && C == 1 ){
580 //use CSR for 1x1 blocksize
581 csr_binop_csr(n_brow, n_bcol, Ap, Aj, Ax, Bp, Bj, Bx, Cp, Cj, Cx, op);
582 }
583 else if ( csr_has_canonical_format(n_brow, Ap, Aj) && csr_has_canonical_format(n_brow, Bp, Bj) ){
584 // prefer faster implementation
585 bsr_binop_bsr_canonical(n_brow, n_bcol, R, C, Ap, Aj, Ax, Bp, Bj, Bx, Cp, Cj, Cx, op);
586 }
587 else {
588 // slower fallback method
589 bsr_binop_bsr_general(n_brow, n_bcol, R, C, Ap, Aj, Ax, Bp, Bj, Bx, Cp, Cj, Cx, op);
590 }
591 }
592
593 /* element-wise binary operations */
594 template <class I, class T, class T2>
bsr_ne_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])595 void bsr_ne_bsr(const I n_row, const I n_col, const I R, const I C,
596 const I Ap[], const I Aj[], const T Ax[],
597 const I Bp[], const I Bj[], const T Bx[],
598 I Cp[], I Cj[], T2 Cx[])
599 {
600 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::not_equal_to<T>());
601 }
602
603 template <class I, class T, class T2>
bsr_lt_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])604 void bsr_lt_bsr(const I n_row, const I n_col, const I R, const I C,
605 const I Ap[], const I Aj[], const T Ax[],
606 const I Bp[], const I Bj[], const T Bx[],
607 I Cp[], I Cj[], T2 Cx[])
608 {
609 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::less<T>());
610 }
611
612 template <class I, class T, class T2>
bsr_gt_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])613 void bsr_gt_bsr(const I n_row, const I n_col, const I R, const I C,
614 const I Ap[], const I Aj[], const T Ax[],
615 const I Bp[], const I Bj[], const T Bx[],
616 I Cp[], I Cj[], T2 Cx[])
617 {
618 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::greater<T>());
619 }
620
621 template <class I, class T, class T2>
bsr_le_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])622 void bsr_le_bsr(const I n_row, const I n_col, const I R, const I C,
623 const I Ap[], const I Aj[], const T Ax[],
624 const I Bp[], const I Bj[], const T Bx[],
625 I Cp[], I Cj[], T2 Cx[])
626 {
627 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::less_equal<T>());
628 }
629
630 template <class I, class T, class T2>
bsr_ge_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T2 Cx[])631 void bsr_ge_bsr(const I n_row, const I n_col, const I R, const I C,
632 const I Ap[], const I Aj[], const T Ax[],
633 const I Bp[], const I Bj[], const T Bx[],
634 I Cp[], I Cj[], T2 Cx[])
635 {
636 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::greater_equal<T>());
637 }
638
639 template <class I, class T>
bsr_elmul_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])640 void bsr_elmul_bsr(const I n_row, const I n_col, const I R, const I C,
641 const I Ap[], const I Aj[], const T Ax[],
642 const I Bp[], const I Bj[], const T Bx[],
643 I Cp[], I Cj[], T Cx[])
644 {
645 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::multiplies<T>());
646 }
647
648 template <class I, class T>
bsr_eldiv_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])649 void bsr_eldiv_bsr(const I n_row, const I n_col, const I R, const I C,
650 const I Ap[], const I Aj[], const T Ax[],
651 const I Bp[], const I Bj[], const T Bx[],
652 I Cp[], I Cj[], T Cx[])
653 {
654 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::divides<T>());
655 }
656
657
658 template <class I, class T>
bsr_plus_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])659 void bsr_plus_bsr(const I n_row, const I n_col, const I R, const I C,
660 const I Ap[], const I Aj[], const T Ax[],
661 const I Bp[], const I Bj[], const T Bx[],
662 I Cp[], I Cj[], T Cx[])
663 {
664 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::plus<T>());
665 }
666
667 template <class I, class T>
bsr_minus_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])668 void bsr_minus_bsr(const I n_row, const I n_col, const I R, const I C,
669 const I Ap[], const I Aj[], const T Ax[],
670 const I Bp[], const I Bj[], const T Bx[],
671 I Cp[], I Cj[], T Cx[])
672 {
673 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,std::minus<T>());
674 }
675
676
677 template <class I, class T>
bsr_maximum_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])678 void bsr_maximum_bsr(const I n_row, const I n_col, const I R, const I C,
679 const I Ap[], const I Aj[], const T Ax[],
680 const I Bp[], const I Bj[], const T Bx[],
681 I Cp[], I Cj[], T Cx[])
682 {
683 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,maximum<T>());
684 }
685
686 template <class I, class T>
bsr_minimum_bsr(const I n_row,const I n_col,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const I Bp[],const I Bj[],const T Bx[],I Cp[],I Cj[],T Cx[])687 void bsr_minimum_bsr(const I n_row, const I n_col, const I R, const I C,
688 const I Ap[], const I Aj[], const T Ax[],
689 const I Bp[], const I Bj[], const T Bx[],
690 I Cp[], I Cj[], T Cx[])
691 {
692 bsr_binop_bsr(n_row,n_col,R,C,Ap,Aj,Ax,Bp,Bj,Bx,Cp,Cj,Cx,minimum<T>());
693 }
694
695
696 /*
697 * Compute B = A for BSR matrix A, CSR matrix B
698 *
699 * Input Arguments:
700 * I n_brow - number of block rows in A
701 * I n_bcol - number of block columns in A
702 * I R - row blocksize
703 * I C - column blocksize
704 * I Ap[n_brow+1] - block row pointer
705 * I Aj[nnz(A)] - block column indices
706 * T Ax[nnz(A)] - nonzero blocks
707 *
708 * Output Arguments:
709 * I Bp[n_brow*R + 1]- row pointer
710 * I Bj[nnz(B)] - column indices
711 * T Bx[nnz(B)] - nonzero values
712 *
713 * Note:
714 * Complexity: Linear. Specifically O(nnz(A) + max(n_row,n_col))
715 * Output arrays must be preallocated
716 *
717 * Note:
718 * Input: column indices *are not* assumed to be in sorted order or unique
719 * Output: the block column (unsorted) orders, duplicates,
720 * and explicit zeros are preserved
721 *
722 */
723 template <class I, class T>
bsr_tocsr(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],I Bp[],I Bj[],T Bx[])724 void bsr_tocsr(const I n_brow, const I n_bcol, const I R, const I C,
725 const I Ap[], const I Aj[], const T Ax[],
726 I Bp[], I Bj[], T Bx[])
727 {
728 // number of elements per block
729 const I RC = R*C;
730 // nnz
731 const I nnz = Ap[n_brow] * RC;
732 // last element in Bp is always nnz
733 Bp[n_brow * R] = nnz;
734 // loop for block row
735 for(I brow = 0; brow < n_brow; brow++){
736 // size of block rows
737 const I brow_size = Ap[brow + 1] - Ap[brow];
738 // size of row in csr
739 const I row_size = C * brow_size;
740 // loop of rows inside block
741 for(I r = 0; r < R; r++){
742 // csr row number
743 const I row = R * brow + r;
744 Bp[row] = RC * Ap[brow] + r * row_size;
745 // loop for block column
746 // block index inside row as loop variable
747 for (I bjj = 0; bjj < brow_size; bjj++)
748 {
749 const I b_ind = Ap[brow] + bjj;
750 // block column number
751 const I bcol = Aj[b_ind];
752 // loop for columns inside block
753 for (I c = 0; c < C; c++)
754 {
755 // bsr data index in Ax
756 // Ax is in C order
757 const I b_data_ind = RC * b_ind + C * r + c;
758 // csr column number
759 const I col = C * bcol + c;
760 // csr data anc col index in Bj and Bx
761 // start from Bp[row], offset by current bjj*C and c
762 const I data_ind = Bp[row] + bjj * C + c;
763 // assign col and data to Bj and Bx
764 Bj[data_ind] = col;
765 Bx[data_ind] = Ax[b_data_ind];
766 }
767 }
768 }
769 }
770 }
771
772
773 template <class I, class T>
bsr_matvec(const I n_brow,const I n_bcol,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const T Xx[],T Yx[])774 void bsr_matvec(const I n_brow,
775 const I n_bcol,
776 const I R,
777 const I C,
778 const I Ap[],
779 const I Aj[],
780 const T Ax[],
781 const T Xx[],
782 T Yx[])
783 {
784 assert(R > 0 && C > 0);
785
786 if( R == 1 && C == 1 ){
787 //use CSR for 1x1 blocksize
788 csr_matvec(n_brow, n_bcol, Ap, Aj, Ax, Xx, Yx);
789 return;
790 }
791
792 const npy_intp RC = (npy_intp)R*C;
793 for(I i = 0; i < n_brow; i++){
794 T * y = Yx + (npy_intp)R * i;
795 for(I jj = Ap[i]; jj < Ap[i+1]; jj++){
796 const I j = Aj[jj];
797 const T * A = Ax + RC * jj;
798 const T * x = Xx + (npy_intp)C * j;
799 gemv(R, C, A, x, y); // y += A*x
800 }
801 }
802 }
803
804
805 /*
806 * Compute Y += A*X for BSR matrix A and dense block vectors X,Y
807 *
808 *
809 * Input Arguments:
810 * I n_brow - number of row blocks in A
811 * I n_bcol - number of column blocks in A
812 * I n_vecs - number of column vectors in X and Y
813 * I R - rows per block
814 * I C - columns per block
815 * I Ap[n_brow+1] - row pointer
816 * I Aj[nblks(A)] - column indices
817 * T Ax[nnz(A)] - nonzeros
818 * T Xx[C*n_bcol,n_vecs] - input vector
819 *
820 * Output Arguments:
821 * T Yx[R*n_brow,n_vecs] - output vector
822 *
823 */
824 template <class I, class T>
bsr_matvecs(const I n_brow,const I n_bcol,const I n_vecs,const I R,const I C,const I Ap[],const I Aj[],const T Ax[],const T Xx[],T Yx[])825 void bsr_matvecs(const I n_brow,
826 const I n_bcol,
827 const I n_vecs,
828 const I R,
829 const I C,
830 const I Ap[],
831 const I Aj[],
832 const T Ax[],
833 const T Xx[],
834 T Yx[])
835 {
836 assert(R > 0 && C > 0);
837
838 if( R == 1 && C == 1 ){
839 //use CSR for 1x1 blocksize
840 csr_matvecs(n_brow, n_bcol, n_vecs, Ap, Aj, Ax, Xx, Yx);
841 return;
842 }
843
844 const npy_intp A_bs = (npy_intp)R*C; //Ax blocksize
845 const npy_intp Y_bs = (npy_intp)n_vecs*R; //Yx blocksize
846 const npy_intp X_bs = (npy_intp)C*n_vecs; //Xx blocksize
847
848 for(I i = 0; i < n_brow; i++){
849 T * y = Yx + Y_bs * i;
850 for(I jj = Ap[i]; jj < Ap[i+1]; jj++){
851 const I j = Aj[jj];
852 const T * A = Ax + A_bs * jj;
853 const T * x = Xx + X_bs * j;
854 gemm(R, n_vecs, C, A, x, y); // y += A*x
855 }
856 }
857 }
858
859
860 #endif
861