1 /* ========================================================================== */ 2 /* === ssmult_template.c ==================================================== */ 3 /* ========================================================================== */ 4 5 /* C = A*B, where A and B are sparse. The column pointers for C (the Cp array) 6 * have already been computed. entries are dropped. This code fragment is 7 * #include'd into ssmult.c four times, with all four combinations of ACOMPLEX 8 * (defined or not) and BCOMPLEX (defined or not). 9 * 10 * By default, C is returned with sorted column indices, and with explicit 11 * zero entries dropped. If C is complex with an all-zero imaginary part, then 12 * the imaginary part is freed and C becomes real. Thus, C is a pure MATLAB 13 * sparse matrix. 14 * 15 * If UNSORTED is defined (-DUNSORTED), then the nonzero pattern of C is 16 * returned with unsorted column indices. This is much faster than returning a 17 * pure MATLAB sparse matrix, but the result must eventually be sorted prior to 18 * returning to MATLAB. 19 * 20 * If the compiler bug discussed below does not affect you, then uncomment the 21 * following line, or compile with code with -DNO_GCC_BUG. 22 23 #define NO_GCC_BUG 24 25 * The gcc bug occurs when cij underflows to zero: 26 * 27 * cij = aik * bkj ; 28 * if (cij == 0) 29 * { 30 * drop this entry 31 * } 32 * 33 * If cij underflows, cij is zero but the above test is incorrectly FALSE with 34 * gcc -O, using gcc version 4.1.0 on an Intel Pentium. The bug does not appear 35 * on an AMD Opteron with the same compiler. The solution is to store cij to 36 * memory first, and then to read it back in and test it, which is slower. 37 */ 38 39 /* -------------------------------------------------------------------------- */ 40 /* MULT: multiply (or multiply and accumulate, depending on op) */ 41 /* -------------------------------------------------------------------------- */ 42 43 /* op can be "=" or "+=" */ 44 45 #ifdef ACOMPLEX 46 #ifdef BCOMPLEX 47 #define MULT(x,z,op) \ 48 azik = (ac ? (-Az [pa]) : (Az [pa])) ; \ 49 x op Ax [pa] * bkj - azik * bzkj ; \ 50 z op azik * bkj + Ax [pa] * bzkj ; 51 #else 52 #define MULT(x,z,op) \ 53 azik = (ac ? (-Az [pa]) : (Az [pa])) ; \ 54 x op Ax [pa] * bkj ; \ 55 z op azik * bkj ; 56 #endif 57 #else 58 #ifdef BCOMPLEX 59 #define MULT(x,z,op) \ 60 x op Ax [pa] * bkj ; \ 61 z op Ax [pa] * bzkj ; 62 #else 63 #define MULT(x,z,op) \ 64 x op Ax [pa] * bkj ; 65 #endif 66 #endif 67 68 /* -------------------------------------------------------------------------- */ 69 /* ASSIGN_BKJ: copy B(k,j) into a local scalar */ 70 /* -------------------------------------------------------------------------- */ 71 72 #ifdef BCOMPLEX 73 #define ASSIGN_BKJ \ 74 bkj = Bx [pb] ; \ 75 bzkj = (bc ? (-Bz [pb]) : (Bz [pb])) ; 76 #else 77 #define ASSIGN_BKJ \ 78 bkj = Bx [pb] ; 79 #endif 80 81 /* -------------------------------------------------------------------------- */ 82 /* DROP_CHECK: check if an entry must be dropped */ 83 /* -------------------------------------------------------------------------- */ 84 85 #if defined (ACOMPLEX) || defined (BCOMPLEX) 86 #define DROP_CHECK(x,z) \ 87 if (x == 0 && z == 0) drop = 1 ; \ 88 if (z != 0) zallzero = 0 ; 89 #else 90 #define DROP_CHECK(x,z) if (x == 0) drop = 1 ; 91 #endif 92 93 /* -------------------------------------------------------------------------- */ 94 /* sparse matrix multiply template */ 95 /* -------------------------------------------------------------------------- */ 96 97 { 98 99 #ifdef ACOMPLEX 100 double azik ; 101 #endif 102 103 /* ---------------------------------------------------------------------- */ 104 /* initialize drop tests */ 105 /* ---------------------------------------------------------------------- */ 106 107 drop = 0 ; /* true if any entry in C is zero */ 108 zallzero = 1 ; /* true if Cz is all zero */ 109 110 /* ---------------------------------------------------------------------- */ 111 /* quick check if A is diagonal, or a permutation matrix */ 112 /* ---------------------------------------------------------------------- */ 113 114 if (Anrow == Ancol && Ap [Ancol] == Ancol) 115 { 116 /* A is square, with n == nnz (A); check the pattern */ 117 A_is_permutation = 1 ; 118 A_is_diagonal = 1 ; 119 for (j = 0 ; j < Ancol ; j++) 120 { 121 if (Ap [j] != j) 122 { 123 /* A has a column with no entries, or more than 1 entry */ 124 A_is_permutation = 0 ; 125 A_is_diagonal = 0 ; 126 break ; 127 } 128 } 129 mark-- ; /* Flag [0..n-1] != mark is now true */ 130 for (j = 0 ; j < Ancol && (A_is_permutation || A_is_diagonal) ; j++) 131 { 132 /* A has one entry in each column, so j == Ap [j] */ 133 i = Ai [j] ; 134 if (i != j) 135 { 136 /* A is not diagonal, but might still be a permutation */ 137 A_is_diagonal = 0 ; 138 } 139 if (Flag [i] == mark) 140 { 141 /* row i appears twice; A is neither permutation nor diagonal */ 142 A_is_permutation = 0 ; 143 A_is_diagonal = 0 ; 144 } 145 /* mark row i, so we know if we see it again */ 146 Flag [i] = mark ; 147 } 148 } 149 else 150 { 151 /* A is not square, or nnz (A) is not equal to n */ 152 A_is_permutation = 0 ; 153 A_is_diagonal = 0 ; 154 } 155 156 /* ---------------------------------------------------------------------- */ 157 /* allocate workspace */ 158 /* ---------------------------------------------------------------------- */ 159 160 #ifndef UNSORTED 161 W = NULL ; 162 if (!A_is_diagonal) 163 { 164 #if defined (ACOMPLEX) || defined (BCOMPLEX) 165 W = mxMalloc (Anrow * 2 * sizeof (double)) ; 166 Wz = W + Anrow ; 167 #else 168 W = mxMalloc (Anrow * sizeof (double)) ; 169 #endif 170 } 171 #endif 172 173 /* ---------------------------------------------------------------------- */ 174 /* compute C one column at a time */ 175 /* ---------------------------------------------------------------------- */ 176 177 if (A_is_diagonal) 178 { 179 180 /* ------------------------------------------------------------------ */ 181 /* C = A*B where A is diagonal */ 182 /* ------------------------------------------------------------------ */ 183 184 pb = 0 ; 185 for (j = 0 ; j < Bncol ; j++) 186 { 187 pcstart = pb ; 188 pbend = Bp [j+1] ; /* column B is in Bi,Bx,Bz [pb ... pbend+1] */ 189 for ( ; pb < pbend ; pb++) 190 { 191 k = Bi [pb] ; /* nonzero entry B(k,j) */ 192 ASSIGN_BKJ ; 193 Ci [pb] = k ; 194 pa = k ; 195 MULT (Cx [pb], Cz [pb], =) ; /* C(k,j) = A(k,k)*B(k,j) */ 196 #ifdef NO_GCC_BUG 197 DROP_CHECK (Cx [pb], Cz [pb]) ; /* check if C(k,j) == 0 */ 198 #endif 199 } 200 201 #ifndef NO_GCC_BUG 202 for (pc = pcstart ; pc < pbend ; pc++) 203 { 204 DROP_CHECK (Cx [pc], Cz [pc]) ; /* check if C(k,j) == 0 */ 205 } 206 #endif 207 } 208 209 } 210 else 211 { 212 213 /* ------------------------------------------------------------------ */ 214 /* C = A*B, general case, or A permutation */ 215 /* ------------------------------------------------------------------ */ 216 217 pb = 0 ; 218 cnz = 0 ; 219 for (j = 0 ; j < Bncol ; j++) 220 { 221 222 /* -------------------------------------------------------------- */ 223 /* compute jth column of C: C(:,j) = A * B(:,j) */ 224 /* -------------------------------------------------------------- */ 225 226 pbend = Bp [j+1] ; /* column B is in Bi,Bx,Bz [pb ... pbend+1] */ 227 pcstart = cnz ; /* start of column j in C */ 228 blen = pbend - pb ; /* number of entries in B */ 229 needs_sorting = 0 ; /* true if column j needs sorting */ 230 231 if (blen == 0) 232 { 233 234 /* ---------------------------------------------------------- */ 235 /* nothing to do, B(:,j) and C(:,j) are empty */ 236 /* ---------------------------------------------------------- */ 237 238 continue ; 239 240 } 241 else if (blen == 1) 242 { 243 244 /* ---------------------------------------------------------- */ 245 /* B(:,j) contains only one nonzero */ 246 /* ---------------------------------------------------------- */ 247 248 /* since there is only one entry in B, just scale column A(:,k): 249 * C(:,j) = A(:,k) * B(k,j) 250 * C is sorted only if A is sorted on input */ 251 252 k = Bi [pb] ; /* nonzero entry B(k,j) */ 253 ASSIGN_BKJ ; 254 paend = Ap [k+1] ; 255 for (pa = Ap [k] ; pa < paend ; pa++, cnz++) 256 { 257 Ci [cnz] = Ai [pa] ; /* nonzero entry A(i,k) */ 258 MULT (Cx [cnz], Cz [cnz], =) ; /* C(i,j) = A(i,k)*B(k,j) */ 259 #ifdef NO_GCC_BUG 260 DROP_CHECK (Cx [cnz], Cz [cnz]) ; /* check C(i,j) == 0 */ 261 #endif 262 } 263 pb++ ; 264 265 #ifndef NO_GCC_BUG 266 for (pc = pcstart ; pc < cnz ; pc++) 267 { 268 DROP_CHECK (Cx [pc], Cz [pc]) ; /* check if C(i,j) == 0 */ 269 } 270 #endif 271 272 } 273 else 274 { 275 276 /* ---------------------------------------------------------- */ 277 /* B(:,j) has two or more entries */ 278 /* ---------------------------------------------------------- */ 279 280 if (A_is_permutation) 281 { 282 283 /* ------------------------------------------------------ */ 284 /* A is a permutation matrix */ 285 /* ------------------------------------------------------ */ 286 287 needs_sorting = 1 ; 288 for ( ; pb < pbend ; pb++) 289 { 290 k = Bi [pb] ; /* nonzero entry B(k,j) */ 291 ASSIGN_BKJ ; 292 i = Ai [k] ; /* nonzero entry A(i,k) */ 293 Ci [pb] = i ; 294 pa = k ; 295 /* C(i,j) = A(i,k)*B(k,j) */ 296 #ifndef UNSORTED 297 MULT (W [i], Wz [i], =) ; 298 #else 299 MULT (Cx [pb], Cz [pb], =) ; 300 #endif 301 } 302 cnz = pbend ; 303 304 } 305 else 306 { 307 308 /* ------------------------------------------------------ */ 309 /* general case */ 310 /* ------------------------------------------------------ */ 311 312 /* first entry in jth column of B is simpler */ 313 /* C(:,j) = A (:,k) * B (k,j) */ 314 k = Bi [pb] ; /* nonzero entry B(k,j) */ 315 ASSIGN_BKJ ; 316 paend = Ap [k+1] ; 317 for (pa = Ap [k] ; pa < paend ; pa++) 318 { 319 i = Ai [pa] ; /* nonzero entry A(i,k) */ 320 Flag [i] = cnz ; 321 Ci [cnz] = i ; /* new entry C(i,j) */ 322 /* C(i,j) = A(i,k)*B(k,j) */ 323 #ifndef UNSORTED 324 MULT (W [i], Wz [i], =) ; 325 #else 326 MULT (Cx [cnz], Cz [cnz], =) ; 327 #endif 328 cnz++ ; 329 } 330 pb++ ; 331 for ( ; pb < pbend ; pb++) 332 { 333 k = Bi [pb] ; /* nonzero entry B(k,j) */ 334 ASSIGN_BKJ ; 335 /* C(:,j) += A (:,k) * B (k,j) */ 336 paend = Ap [k+1] ; 337 for (pa = Ap [k] ; pa < paend ; pa++) 338 { 339 i = Ai [pa] ; /* nonzero entry A(i,k) */ 340 pc = Flag [i] ; 341 if (pc < pcstart) 342 { 343 pc = cnz++ ; 344 Flag [i] = pc ; 345 Ci [pc] = i ; /* new entry C(i,j) */ 346 /* C(i,j) = A(i,k)*B(k,j) */ 347 #ifndef UNSORTED 348 MULT (W [i], Wz [i], =) ; 349 needs_sorting = 1 ; 350 #else 351 MULT (Cx [pc], Cz [pc], =) ; 352 #endif 353 } 354 else 355 { 356 /* C(i,j) += A(i,k)*B(k,j) */ 357 #ifndef UNSORTED 358 MULT (W [i], Wz [i], +=) ; 359 #else 360 MULT (Cx [pc], Cz [pc], +=) ; 361 #endif 362 } 363 } 364 } 365 } 366 367 /* ---------------------------------------------------------- */ 368 /* sort the pattern of C(:,j) and gather the values of C(:,j) */ 369 /* ---------------------------------------------------------- */ 370 371 #ifndef UNSORTED 372 /* Sort the row indices in C(:,j). Use Cx as Int workspace. 373 * This assumes sizeof (Int) < sizeof (double). If blen <= 1, 374 * or if subsequent entries in B(:,j) appended entries onto C, 375 * there is no need to sort C(:,j), assuming A is sorted. */ 376 if (needs_sorting) 377 { 378 ssmergesort (Ci + pcstart, (Int *) (Cx + pcstart), 379 cnz - pcstart) ; 380 } 381 for (pc = pcstart ; pc < cnz ; pc++) 382 { 383 #if defined (ACOMPLEX) || defined (BCOMPLEX) 384 i = Ci [pc] ; 385 cij = W [i] ; /* get C(i,j) from W */ 386 czij = Wz [i] ; 387 Cx [pc] = cij ; /* copy C(i,j) into C */ 388 Cz [pc] = czij ; 389 #else 390 cij = W [Ci [pc]] ; /* get C(i,j) from W */ 391 Cx [pc] = cij ; /* copy C(i,j) into C */ 392 #endif 393 DROP_CHECK (cij, czij) ; /* check if C(i,j) == 0 */ 394 } 395 #else 396 /* no need to sort, but we do need to check for drop */ 397 for (pc = pcstart ; pc < cnz ; pc++) 398 { 399 DROP_CHECK (Cx [pc], Cz [pc]) ; /* check if C(i,j) == 0 */ 400 } 401 #endif 402 } 403 } 404 } 405 406 /* ---------------------------------------------------------------------- */ 407 /* free workspace */ 408 /* ---------------------------------------------------------------------- */ 409 410 #ifndef UNSORTED 411 mxFree (W) ; 412 #endif 413 414 415 } 416 417 #undef ACOMPLEX 418 #undef BCOMPLEX 419 #undef MULT 420 #undef ASSIGN_BKJ 421 #undef DROP_CHECK 422