1 /*!
2 * \file
3 *
4 * \brief Various routines with dealing with CSR matrices
5 *
6 * \author George Karypis
7 * \version\verbatim $Id: csr.c 13437 2013-01-11 21:54:10Z karypis $ \endverbatim
8 */
9
10 #include <GKlib.h>
11
12 #define OMPMINOPS 50000
13
14 /*************************************************************************/
15 /*! Allocate memory for a CSR matrix and initializes it
16 \returns the allocated matrix. The various fields are set to NULL.
17 */
18 /**************************************************************************/
gk_csr_Create()19 gk_csr_t *gk_csr_Create()
20 {
21 gk_csr_t *mat;
22
23 mat = (gk_csr_t *)gk_malloc(sizeof(gk_csr_t), "gk_csr_Create: mat");
24
25 gk_csr_Init(mat);
26
27 return mat;
28 }
29
30
31 /*************************************************************************/
32 /*! Initializes the matrix
33 \param mat is the matrix to be initialized.
34 */
35 /*************************************************************************/
gk_csr_Init(gk_csr_t * mat)36 void gk_csr_Init(gk_csr_t *mat)
37 {
38 memset(mat, 0, sizeof(gk_csr_t));
39 mat->nrows = mat->ncols = -1;
40 }
41
42
43 /*************************************************************************/
44 /*! Frees all the memory allocated for matrix.
45 \param mat is the matrix to be freed.
46 */
47 /*************************************************************************/
gk_csr_Free(gk_csr_t ** mat)48 void gk_csr_Free(gk_csr_t **mat)
49 {
50 if (*mat == NULL)
51 return;
52 gk_csr_FreeContents(*mat);
53 gk_free((void **)mat, LTERM);
54 }
55
56
57 /*************************************************************************/
58 /*! Frees only the memory allocated for the matrix's different fields and
59 sets them to NULL.
60 \param mat is the matrix whose contents will be freed.
61 */
62 /*************************************************************************/
gk_csr_FreeContents(gk_csr_t * mat)63 void gk_csr_FreeContents(gk_csr_t *mat)
64 {
65 gk_free((void *)&mat->rowptr, &mat->rowind, &mat->rowval, &mat->rowids,
66 &mat->colptr, &mat->colind, &mat->colval, &mat->colids,
67 &mat->rnorms, &mat->cnorms, &mat->rsums, &mat->csums,
68 &mat->rsizes, &mat->csizes, &mat->rvols, &mat->cvols,
69 &mat->rwgts, &mat->cwgts,
70 LTERM);
71 }
72
73
74 /*************************************************************************/
75 /*! Returns a copy of a matrix.
76 \param mat is the matrix to be duplicated.
77 \returns the newly created copy of the matrix.
78 */
79 /**************************************************************************/
gk_csr_Dup(gk_csr_t * mat)80 gk_csr_t *gk_csr_Dup(gk_csr_t *mat)
81 {
82 gk_csr_t *nmat;
83
84 nmat = gk_csr_Create();
85
86 nmat->nrows = mat->nrows;
87 nmat->ncols = mat->ncols;
88
89 /* copy the row structure */
90 if (mat->rowptr)
91 nmat->rowptr = gk_zcopy(mat->nrows+1, mat->rowptr,
92 gk_zmalloc(mat->nrows+1, "gk_csr_Dup: rowptr"));
93 if (mat->rowids)
94 nmat->rowids = gk_icopy(mat->nrows, mat->rowids,
95 gk_imalloc(mat->nrows, "gk_csr_Dup: rowids"));
96 if (mat->rnorms)
97 nmat->rnorms = gk_fcopy(mat->nrows, mat->rnorms,
98 gk_fmalloc(mat->nrows, "gk_csr_Dup: rnorms"));
99 if (mat->rowind)
100 nmat->rowind = gk_icopy(mat->rowptr[mat->nrows], mat->rowind,
101 gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowind"));
102 if (mat->rowval)
103 nmat->rowval = gk_fcopy(mat->rowptr[mat->nrows], mat->rowval,
104 gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowval"));
105
106 /* copy the col structure */
107 if (mat->colptr)
108 nmat->colptr = gk_zcopy(mat->ncols+1, mat->colptr,
109 gk_zmalloc(mat->ncols+1, "gk_csr_Dup: colptr"));
110 if (mat->colids)
111 nmat->colids = gk_icopy(mat->ncols, mat->colids,
112 gk_imalloc(mat->ncols, "gk_csr_Dup: colids"));
113 if (mat->cnorms)
114 nmat->cnorms = gk_fcopy(mat->ncols, mat->cnorms,
115 gk_fmalloc(mat->ncols, "gk_csr_Dup: cnorms"));
116 if (mat->colind)
117 nmat->colind = gk_icopy(mat->colptr[mat->ncols], mat->colind,
118 gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colind"));
119 if (mat->colval)
120 nmat->colval = gk_fcopy(mat->colptr[mat->ncols], mat->colval,
121 gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colval"));
122
123 return nmat;
124 }
125
126
127 /*************************************************************************/
128 /*! Returns a submatrix containint a set of consecutive rows.
129 \param mat is the original matrix.
130 \param rstart is the starting row.
131 \param nrows is the number of rows from rstart to extract.
132 \returns the row structure of the newly created submatrix.
133 */
134 /**************************************************************************/
gk_csr_ExtractSubmatrix(gk_csr_t * mat,int rstart,int nrows)135 gk_csr_t *gk_csr_ExtractSubmatrix(gk_csr_t *mat, int rstart, int nrows)
136 {
137 ssize_t i;
138 gk_csr_t *nmat;
139
140 if (rstart+nrows > mat->nrows)
141 return NULL;
142
143 nmat = gk_csr_Create();
144
145 nmat->nrows = nrows;
146 nmat->ncols = mat->ncols;
147
148 /* copy the row structure */
149 if (mat->rowptr)
150 nmat->rowptr = gk_zcopy(nrows+1, mat->rowptr+rstart,
151 gk_zmalloc(nrows+1, "gk_csr_ExtractSubmatrix: rowptr"));
152 for (i=nrows; i>=0; i--)
153 nmat->rowptr[i] -= nmat->rowptr[0];
154 ASSERT(nmat->rowptr[0] == 0);
155
156 if (mat->rowids)
157 nmat->rowids = gk_icopy(nrows, mat->rowids+rstart,
158 gk_imalloc(nrows, "gk_csr_ExtractSubmatrix: rowids"));
159 if (mat->rnorms)
160 nmat->rnorms = gk_fcopy(nrows, mat->rnorms+rstart,
161 gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rnorms"));
162
163 if (mat->rsums)
164 nmat->rsums = gk_fcopy(nrows, mat->rsums+rstart,
165 gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rsums"));
166
167 ASSERT(nmat->rowptr[nrows] == mat->rowptr[rstart+nrows]-mat->rowptr[rstart]);
168 if (mat->rowind)
169 nmat->rowind = gk_icopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
170 mat->rowind+mat->rowptr[rstart],
171 gk_imalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
172 "gk_csr_ExtractSubmatrix: rowind"));
173 if (mat->rowval)
174 nmat->rowval = gk_fcopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
175 mat->rowval+mat->rowptr[rstart],
176 gk_fmalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
177 "gk_csr_ExtractSubmatrix: rowval"));
178
179 return nmat;
180 }
181
182
183 /*************************************************************************/
184 /*! Returns a submatrix containing a certain set of rows.
185 \param mat is the original matrix.
186 \param nrows is the number of rows to extract.
187 \param rind is the set of row numbers to extract.
188 \returns the row structure of the newly created submatrix.
189 */
190 /**************************************************************************/
gk_csr_ExtractRows(gk_csr_t * mat,int nrows,int * rind)191 gk_csr_t *gk_csr_ExtractRows(gk_csr_t *mat, int nrows, int *rind)
192 {
193 ssize_t i, ii, j, nnz;
194 gk_csr_t *nmat;
195
196 nmat = gk_csr_Create();
197
198 nmat->nrows = nrows;
199 nmat->ncols = mat->ncols;
200
201 for (nnz=0, i=0; i<nrows; i++)
202 nnz += mat->rowptr[rind[i]+1]-mat->rowptr[rind[i]];
203
204 nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
205 nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
206 nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
207
208 nmat->rowptr[0] = 0;
209 for (nnz=0, j=0, ii=0; ii<nrows; ii++) {
210 i = rind[ii];
211 gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
212 gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
213 nnz += mat->rowptr[i+1]-mat->rowptr[i];
214 nmat->rowptr[++j] = nnz;
215 }
216 ASSERT(j == nmat->nrows);
217
218 return nmat;
219 }
220
221
222 /*************************************************************************/
223 /*! Returns a submatrix corresponding to a specified partitioning of rows.
224 \param mat is the original matrix.
225 \param part is the partitioning vector of the rows.
226 \param pid is the partition ID that will be extracted.
227 \returns the row structure of the newly created submatrix.
228 */
229 /**************************************************************************/
gk_csr_ExtractPartition(gk_csr_t * mat,int * part,int pid)230 gk_csr_t *gk_csr_ExtractPartition(gk_csr_t *mat, int *part, int pid)
231 {
232 ssize_t i, j, nnz;
233 gk_csr_t *nmat;
234
235 nmat = gk_csr_Create();
236
237 nmat->nrows = 0;
238 nmat->ncols = mat->ncols;
239
240 for (nnz=0, i=0; i<mat->nrows; i++) {
241 if (part[i] == pid) {
242 nmat->nrows++;
243 nnz += mat->rowptr[i+1]-mat->rowptr[i];
244 }
245 }
246
247 nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
248 nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
249 nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
250
251 nmat->rowptr[0] = 0;
252 for (nnz=0, j=0, i=0; i<mat->nrows; i++) {
253 if (part[i] == pid) {
254 gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
255 gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
256 nnz += mat->rowptr[i+1]-mat->rowptr[i];
257 nmat->rowptr[++j] = nnz;
258 }
259 }
260 ASSERT(j == nmat->nrows);
261
262 return nmat;
263 }
264
265
266 /*************************************************************************/
267 /*! Splits the matrix into multiple sub-matrices based on the provided
268 color array.
269 \param mat is the original matrix.
270 \param color is an array of size equal to the number of non-zeros
271 in the matrix (row-wise structure). The matrix is split into
272 as many parts as the number of colors. For meaningfull results,
273 the colors should be numbered consecutively starting from 0.
274 \returns an array of matrices for each supplied color number.
275 */
276 /**************************************************************************/
gk_csr_Split(gk_csr_t * mat,int * color)277 gk_csr_t **gk_csr_Split(gk_csr_t *mat, int *color)
278 {
279 ssize_t i, j;
280 int nrows, ncolors;
281 ssize_t *rowptr;
282 int *rowind;
283 float *rowval;
284 gk_csr_t **smats;
285
286 nrows = mat->nrows;
287 rowptr = mat->rowptr;
288 rowind = mat->rowind;
289 rowval = mat->rowval;
290
291 ncolors = gk_imax(rowptr[nrows], color)+1;
292
293 smats = (gk_csr_t **)gk_malloc(sizeof(gk_csr_t *)*ncolors, "gk_csr_Split: smats");
294 for (i=0; i<ncolors; i++) {
295 smats[i] = gk_csr_Create();
296 smats[i]->nrows = mat->nrows;
297 smats[i]->ncols = mat->ncols;
298 smats[i]->rowptr = gk_zsmalloc(nrows+1, 0, "gk_csr_Split: smats[i]->rowptr");
299 }
300
301 for (i=0; i<nrows; i++) {
302 for (j=rowptr[i]; j<rowptr[i+1]; j++)
303 smats[color[j]]->rowptr[i]++;
304 }
305 for (i=0; i<ncolors; i++)
306 MAKECSR(j, nrows, smats[i]->rowptr);
307
308 for (i=0; i<ncolors; i++) {
309 smats[i]->rowind = gk_imalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowind");
310 smats[i]->rowval = gk_fmalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowval");
311 }
312
313 for (i=0; i<nrows; i++) {
314 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
315 smats[color[j]]->rowind[smats[color[j]]->rowptr[i]] = rowind[j];
316 smats[color[j]]->rowval[smats[color[j]]->rowptr[i]] = rowval[j];
317 smats[color[j]]->rowptr[i]++;
318 }
319 }
320
321 for (i=0; i<ncolors; i++)
322 SHIFTCSR(j, nrows, smats[i]->rowptr);
323
324 return smats;
325 }
326
327
328 /**************************************************************************/
329 /*! Reads a CSR matrix from the supplied file and stores it the matrix's
330 forward structure.
331 \param filename is the file that stores the data.
332 \param format is either GK_CSR_FMT_METIS, GK_CSR_FMT_CLUTO,
333 GK_CSR_FMT_CSR, GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL
334 specifying the type of the input format.
335 The GK_CSR_FMT_CSR does not contain a header
336 line, whereas the GK_CSR_FMT_BINROW is a binary format written
337 by gk_csr_Write() using the same format specifier.
338 \param readvals is either 1 or 0, indicating if the CSR file contains
339 values or it does not. It only applies when GK_CSR_FMT_CSR is
340 used.
341 \param numbering is either 1 or 0, indicating if the numbering of the
342 indices start from 1 or 0, respectively. If they start from 1,
343 they are automatically decreamented during input so that they
344 will start from 0. It only applies when GK_CSR_FMT_CSR is
345 used.
346 \returns the matrix that was read.
347 */
348 /**************************************************************************/
gk_csr_Read(char * filename,int format,int readvals,int numbering)349 gk_csr_t *gk_csr_Read(char *filename, int format, int readvals, int numbering)
350 {
351 ssize_t i, k, l;
352 size_t nfields, nrows, ncols, nnz, fmt, ncon;
353 size_t lnlen;
354 ssize_t *rowptr;
355 int *rowind, ival;
356 float *rowval=NULL, fval;
357 int readsizes, readwgts;
358 char *line=NULL, *head, *tail, fmtstr[256];
359 FILE *fpin;
360 gk_csr_t *mat=NULL;
361
362
363 if (!gk_fexists(filename))
364 gk_errexit(SIGERR, "File %s does not exist!\n", filename);
365
366 if (format == GK_CSR_FMT_BINROW) {
367 mat = gk_csr_Create();
368
369 fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
370 if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
371 gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
372 if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
373 gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
374 mat->rowptr = gk_zmalloc(mat->nrows+1, "gk_csr_Read: rowptr");
375 if (fread(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpin) != mat->nrows+1)
376 gk_errexit(SIGERR, "Failed to read the rowptr from file %s!\n", filename);
377 mat->rowind = gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowind");
378 if (fread(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
379 gk_errexit(SIGERR, "Failed to read the rowind from file %s!\n", filename);
380 if (readvals == 1) {
381 mat->rowval = gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowval");
382 if (fread(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
383 gk_errexit(SIGERR, "Failed to read the rowval from file %s!\n", filename);
384 }
385
386 gk_fclose(fpin);
387 return mat;
388 }
389
390 if (format == GK_CSR_FMT_BINCOL) {
391 mat = gk_csr_Create();
392
393 fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
394 if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
395 gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
396 if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
397 gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
398 mat->colptr = gk_zmalloc(mat->ncols+1, "gk_csr_Read: colptr");
399 if (fread(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpin) != mat->ncols+1)
400 gk_errexit(SIGERR, "Failed to read the colptr from file %s!\n", filename);
401 mat->colind = gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Read: colind");
402 if (fread(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
403 gk_errexit(SIGERR, "Failed to read the colind from file %s!\n", filename);
404 if (readvals) {
405 mat->colval = gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Read: colval");
406 if (fread(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
407 gk_errexit(SIGERR, "Failed to read the colval from file %s!\n", filename);
408 }
409
410 gk_fclose(fpin);
411 return mat;
412 }
413
414
415 if (format == GK_CSR_FMT_CLUTO) {
416 fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
417 do {
418 if (gk_getline(&line, &lnlen, fpin) <= 0)
419 gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
420 } while (line[0] == '%');
421
422 if (sscanf(line, "%zu %zu %zu", &nrows, &ncols, &nnz) != 3)
423 gk_errexit(SIGERR, "Header line must contain 3 integers.\n");
424
425 readsizes = 0;
426 readwgts = 0;
427 readvals = 1;
428 numbering = 1;
429 }
430 else if (format == GK_CSR_FMT_METIS) {
431 fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
432 do {
433 if (gk_getline(&line, &lnlen, fpin) <= 0)
434 gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
435 } while (line[0] == '%');
436
437 fmt = ncon = 0;
438 nfields = sscanf(line, "%zu %zu %zu %zu", &nrows, &nnz, &fmt, &ncon);
439 if (nfields < 2)
440 gk_errexit(SIGERR, "Header line must contain at least 2 integers (#vtxs and #edges).\n");
441
442 ncols = nrows;
443 nnz *= 2;
444
445 if (fmt > 111)
446 gk_errexit(SIGERR, "Cannot read this type of file format [fmt=%zu]!\n", fmt);
447
448 sprintf(fmtstr, "%03zu", fmt%1000);
449 readsizes = (fmtstr[0] == '1');
450 readwgts = (fmtstr[1] == '1');
451 readvals = (fmtstr[2] == '1');
452 numbering = 1;
453 ncon = (ncon == 0 ? 1 : ncon);
454 }
455 else {
456 readsizes = 0;
457 readwgts = 0;
458
459 gk_getfilestats(filename, &nrows, &nnz, NULL, NULL);
460
461 if (readvals == 1 && nnz%2 == 1)
462 gk_errexit(SIGERR, "Error: The number of numbers (%zd %d) in the input file is not even.\n", nnz, readvals);
463 if (readvals == 1)
464 nnz = nnz/2;
465 fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
466 }
467
468 mat = gk_csr_Create();
469
470 mat->nrows = nrows;
471
472 rowptr = mat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Read: rowptr");
473 rowind = mat->rowind = gk_imalloc(nnz, "gk_csr_Read: rowind");
474 if (readvals != 2)
475 rowval = mat->rowval = gk_fsmalloc(nnz, 1.0, "gk_csr_Read: rowval");
476
477 if (readsizes)
478 mat->rsizes = gk_fsmalloc(nrows, 0.0, "gk_csr_Read: rsizes");
479
480 if (readwgts)
481 mat->rwgts = gk_fsmalloc(nrows*ncon, 0.0, "gk_csr_Read: rwgts");
482
483 /*----------------------------------------------------------------------
484 * Read the sparse matrix file
485 *---------------------------------------------------------------------*/
486 numbering = (numbering ? - 1 : 0);
487 for (ncols=0, rowptr[0]=0, k=0, i=0; i<nrows; i++) {
488 do {
489 if (gk_getline(&line, &lnlen, fpin) == -1)
490 gk_errexit(SIGERR, "Premature end of input file: file while reading row %d\n", i);
491 } while (line[0] == '%');
492
493 head = line;
494 tail = NULL;
495
496 /* Read vertex sizes */
497 if (readsizes) {
498 #ifdef __MSC__
499 mat->rsizes[i] = (float)strtod(head, &tail);
500 #else
501 mat->rsizes[i] = strtof(head, &tail);
502 #endif
503 if (tail == head)
504 gk_errexit(SIGERR, "The line for vertex %zd does not have size information\n", i+1);
505 if (mat->rsizes[i] < 0)
506 errexit("The size for vertex %zd must be >= 0\n", i+1);
507 head = tail;
508 }
509
510 /* Read vertex weights */
511 if (readwgts) {
512 for (l=0; l<ncon; l++) {
513 #ifdef __MSC__
514 mat->rwgts[i*ncon+l] = (float)strtod(head, &tail);
515 #else
516 mat->rwgts[i*ncon+l] = strtof(head, &tail);
517 #endif
518 if (tail == head)
519 errexit("The line for vertex %zd does not have enough weights "
520 "for the %d constraints.\n", i+1, ncon);
521 if (mat->rwgts[i*ncon+l] < 0)
522 errexit("The weight vertex %zd and constraint %zd must be >= 0\n", i+1, l);
523 head = tail;
524 }
525 }
526
527
528 /* Read the rest of the row */
529 while (1) {
530 ival = (int)strtol(head, &tail, 0);
531 if (tail == head)
532 break;
533 head = tail;
534
535 if ((rowind[k] = ival + numbering) < 0)
536 gk_errexit(SIGERR, "Error: Invalid column number %d at row %zd.\n", ival, i);
537
538 ncols = gk_max(rowind[k], ncols);
539
540 if (readvals == 1) {
541 #ifdef __MSC__
542 fval = (float)strtod(head, &tail);
543 #else
544 fval = strtof(head, &tail);
545 #endif
546 if (tail == head)
547 gk_errexit(SIGERR, "Value could not be found for column! Row:%zd, NNZ:%zd\n", i, k);
548 head = tail;
549
550 rowval[k] = fval;
551 }
552 k++;
553 }
554 rowptr[i+1] = k;
555 }
556
557 if (format == GK_CSR_FMT_METIS) {
558 ASSERT(ncols+1 == mat->nrows);
559 mat->ncols = mat->nrows;
560 }
561 else {
562 mat->ncols = ncols+1;
563 }
564
565 if (k != nnz)
566 gk_errexit(SIGERR, "gk_csr_Read: Something wrong with the number of nonzeros in "
567 "the input file. NNZ=%zd, ActualNNZ=%zd.\n", nnz, k);
568
569 gk_fclose(fpin);
570
571 gk_free((void **)&line, LTERM);
572
573 return mat;
574 }
575
576
577 /**************************************************************************/
578 /*! Writes the row-based structure of a matrix into a file.
579 \param mat is the matrix to be written,
580 \param filename is the name of the output file.
581 \param format is one of: GK_CSR_FMT_CLUTO, GK_CSR_FMT_CSR,
582 GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL.
583 \param writevals is either 1 or 0 indicating if the values will be
584 written or not. This is only applicable when GK_CSR_FMT_CSR
585 is used.
586 \param numbering is either 1 or 0 indicating if the internal 0-based
587 numbering will be shifted by one or not during output. This
588 is only applicable when GK_CSR_FMT_CSR is used.
589 */
590 /**************************************************************************/
gk_csr_Write(gk_csr_t * mat,char * filename,int format,int writevals,int numbering)591 void gk_csr_Write(gk_csr_t *mat, char *filename, int format, int writevals, int numbering)
592 {
593 ssize_t i, j;
594 FILE *fpout;
595
596 if (format == GK_CSR_FMT_BINROW) {
597 if (filename == NULL)
598 gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
599 fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
600
601 fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);
602 fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);
603 fwrite(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpout);
604 fwrite(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpout);
605 if (writevals)
606 fwrite(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpout);
607
608 gk_fclose(fpout);
609 return;
610 }
611
612 if (format == GK_CSR_FMT_BINCOL) {
613 if (filename == NULL)
614 gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
615 fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
616
617 fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);
618 fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);
619 fwrite(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpout);
620 fwrite(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpout);
621 if (writevals)
622 fwrite(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpout);
623
624 gk_fclose(fpout);
625 return;
626 }
627
628 if (filename)
629 fpout = gk_fopen(filename, "w", "gk_csr_Write: fpout");
630 else
631 fpout = stdout;
632
633 if (format == GK_CSR_FMT_CLUTO) {
634 fprintf(fpout, "%d %d %zd\n", mat->nrows, mat->ncols, mat->rowptr[mat->nrows]);
635 writevals = 1;
636 numbering = 1;
637 }
638
639 for (i=0; i<mat->nrows; i++) {
640 for (j=mat->rowptr[i]; j<mat->rowptr[i+1]; j++) {
641 fprintf(fpout, " %d", mat->rowind[j]+(numbering ? 1 : 0));
642 if (writevals)
643 fprintf(fpout, " %f", mat->rowval[j]);
644 }
645 fprintf(fpout, "\n");
646 }
647 if (filename)
648 gk_fclose(fpout);
649 }
650
651
652 /*************************************************************************/
653 /*! Prunes certain rows/columns of the matrix. The prunning takes place
654 by analyzing the row structure of the matrix. The prunning takes place
655 by removing rows/columns but it does not affect the numbering of the
656 remaining rows/columns.
657
658 \param mat the matrix to be prunned,
659 \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
660 of the matrix will be prunned,
661 \param minf is the minimum number of rows (columns) that a column (row) must
662 be present in order to be kept,
663 \param maxf is the maximum number of rows (columns) that a column (row) must
664 be present at in order to be kept.
665 \returns the prunned matrix consisting only of its row-based structure.
666 The input matrix is not modified.
667 */
668 /**************************************************************************/
gk_csr_Prune(gk_csr_t * mat,int what,int minf,int maxf)669 gk_csr_t *gk_csr_Prune(gk_csr_t *mat, int what, int minf, int maxf)
670 {
671 ssize_t i, j, nnz;
672 int nrows, ncols;
673 ssize_t *rowptr, *nrowptr;
674 int *rowind, *nrowind, *collen;
675 float *rowval, *nrowval;
676 gk_csr_t *nmat;
677
678 nmat = gk_csr_Create();
679
680 nrows = nmat->nrows = mat->nrows;
681 ncols = nmat->ncols = mat->ncols;
682
683 rowptr = mat->rowptr;
684 rowind = mat->rowind;
685 rowval = mat->rowval;
686
687 nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Prune: nrowptr");
688 nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_Prune: nrowind");
689 nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_Prune: nrowval");
690
691
692 switch (what) {
693 case GK_CSR_COL:
694 collen = gk_ismalloc(ncols, 0, "gk_csr_Prune: collen");
695
696 for (i=0; i<nrows; i++) {
697 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
698 ASSERT(rowind[j] < ncols);
699 collen[rowind[j]]++;
700 }
701 }
702 for (i=0; i<ncols; i++)
703 collen[i] = (collen[i] >= minf && collen[i] <= maxf ? 1 : 0);
704
705 nrowptr[0] = 0;
706 for (nnz=0, i=0; i<nrows; i++) {
707 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
708 if (collen[rowind[j]]) {
709 nrowind[nnz] = rowind[j];
710 nrowval[nnz] = rowval[j];
711 nnz++;
712 }
713 }
714 nrowptr[i+1] = nnz;
715 }
716 gk_free((void **)&collen, LTERM);
717 break;
718
719 case GK_CSR_ROW:
720 nrowptr[0] = 0;
721 for (nnz=0, i=0; i<nrows; i++) {
722 if (rowptr[i+1]-rowptr[i] >= minf && rowptr[i+1]-rowptr[i] <= maxf) {
723 for (j=rowptr[i]; j<rowptr[i+1]; j++, nnz++) {
724 nrowind[nnz] = rowind[j];
725 nrowval[nnz] = rowval[j];
726 }
727 }
728 nrowptr[i+1] = nnz;
729 }
730 break;
731
732 default:
733 gk_csr_Free(&nmat);
734 gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
735 return NULL;
736 }
737
738 return nmat;
739 }
740
741
742 /*************************************************************************/
743 /*! Eliminates certain entries from the rows/columns of the matrix. The
744 filtering takes place by keeping only the highest weight entries whose
745 sum accounts for a certain fraction of the overall weight of the
746 row/column.
747
748 \param mat the matrix to be prunned,
749 \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
750 of the matrix will be prunned,
751 \param norm indicates the norm that will be used to aggregate the weights
752 and possible values are 1 or 2,
753 \param fraction is the fraction of the overall norm that will be retained
754 by the kept entries.
755 \returns the filtered matrix consisting only of its row-based structure.
756 The input matrix is not modified.
757 */
758 /**************************************************************************/
gk_csr_LowFilter(gk_csr_t * mat,int what,int norm,float fraction)759 gk_csr_t *gk_csr_LowFilter(gk_csr_t *mat, int what, int norm, float fraction)
760 {
761 ssize_t i, j, nnz;
762 int nrows, ncols, ncand, maxlen=0;
763 ssize_t *rowptr, *colptr, *nrowptr;
764 int *rowind, *colind, *nrowind;
765 float *rowval, *colval, *nrowval, rsum, tsum;
766 gk_csr_t *nmat;
767 gk_fkv_t *cand;
768
769 nmat = gk_csr_Create();
770
771 nrows = nmat->nrows = mat->nrows;
772 ncols = nmat->ncols = mat->ncols;
773
774 rowptr = mat->rowptr;
775 rowind = mat->rowind;
776 rowval = mat->rowval;
777 colptr = mat->colptr;
778 colind = mat->colind;
779 colval = mat->colval;
780
781 nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
782 nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
783 nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
784
785
786 switch (what) {
787 case GK_CSR_COL:
788 if (mat->colptr == NULL)
789 gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
790
791 gk_zcopy(nrows+1, rowptr, nrowptr);
792
793 for (i=0; i<ncols; i++)
794 maxlen = gk_max(maxlen, colptr[i+1]-colptr[i]);
795
796 #pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
797 {
798 cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
799
800 #pragma omp for schedule(static)
801 for (i=0; i<ncols; i++) {
802 for (tsum=0.0, ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
803 cand[ncand].val = colind[j];
804 cand[ncand].key = colval[j];
805 tsum += (norm == 1 ? colval[j] : colval[j]*colval[j]);
806 }
807 gk_fkvsortd(ncand, cand);
808
809 for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
810 rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
811 nrowind[nrowptr[cand[j].val]] = i;
812 nrowval[nrowptr[cand[j].val]] = cand[j].key;
813 nrowptr[cand[j].val]++;
814 }
815 }
816
817 gk_free((void **)&cand, LTERM);
818 }
819
820 /* compact the nrowind/nrowval */
821 for (nnz=0, i=0; i<nrows; i++) {
822 for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
823 nrowind[nnz] = nrowind[j];
824 nrowval[nnz] = nrowval[j];
825 }
826 nrowptr[i] = nnz;
827 }
828 SHIFTCSR(i, nrows, nrowptr);
829
830 break;
831
832 case GK_CSR_ROW:
833 if (mat->rowptr == NULL)
834 gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
835
836 for (i=0; i<nrows; i++)
837 maxlen = gk_max(maxlen, rowptr[i+1]-rowptr[i]);
838
839 #pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
840 {
841 cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
842
843 #pragma omp for schedule(static)
844 for (i=0; i<nrows; i++) {
845 for (tsum=0.0, ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
846 cand[ncand].val = rowind[j];
847 cand[ncand].key = rowval[j];
848 tsum += (norm == 1 ? rowval[j] : rowval[j]*rowval[j]);
849 }
850 gk_fkvsortd(ncand, cand);
851
852 for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
853 rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
854 nrowind[rowptr[i]+j] = cand[j].val;
855 nrowval[rowptr[i]+j] = cand[j].key;
856 }
857 nrowptr[i+1] = rowptr[i]+j;
858 }
859
860 gk_free((void **)&cand, LTERM);
861 }
862
863 /* compact nrowind/nrowval */
864 nrowptr[0] = nnz = 0;
865 for (i=0; i<nrows; i++) {
866 for (j=rowptr[i]; j<nrowptr[i+1]; j++, nnz++) {
867 nrowind[nnz] = nrowind[j];
868 nrowval[nnz] = nrowval[j];
869 }
870 nrowptr[i+1] = nnz;
871 }
872
873 break;
874
875 default:
876 gk_csr_Free(&nmat);
877 gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
878 return NULL;
879 }
880
881 return nmat;
882 }
883
884
885 /*************************************************************************/
886 /*! Eliminates certain entries from the rows/columns of the matrix. The
887 filtering takes place by keeping only the highest weight top-K entries
888 along each row/column and those entries whose weight is greater than
889 a specified value.
890
891 \param mat the matrix to be prunned,
892 \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
893 of the matrix will be prunned,
894 \param topk is the number of the highest weight entries to keep.
895 \param keepval is the weight of a term above which will be kept. This
896 is used to select additional terms past the first topk.
897 \returns the filtered matrix consisting only of its row-based structure.
898 The input matrix is not modified.
899 */
900 /**************************************************************************/
gk_csr_TopKPlusFilter(gk_csr_t * mat,int what,int topk,float keepval)901 gk_csr_t *gk_csr_TopKPlusFilter(gk_csr_t *mat, int what, int topk, float keepval)
902 {
903 ssize_t i, j, k, nnz;
904 int nrows, ncols, ncand;
905 ssize_t *rowptr, *colptr, *nrowptr;
906 int *rowind, *colind, *nrowind;
907 float *rowval, *colval, *nrowval;
908 gk_csr_t *nmat;
909 gk_fkv_t *cand;
910
911 nmat = gk_csr_Create();
912
913 nrows = nmat->nrows = mat->nrows;
914 ncols = nmat->ncols = mat->ncols;
915
916 rowptr = mat->rowptr;
917 rowind = mat->rowind;
918 rowval = mat->rowval;
919 colptr = mat->colptr;
920 colind = mat->colind;
921 colval = mat->colval;
922
923 nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
924 nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
925 nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
926
927
928 switch (what) {
929 case GK_CSR_COL:
930 if (mat->colptr == NULL)
931 gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
932
933 cand = gk_fkvmalloc(nrows, "gk_csr_LowFilter: cand");
934
935 gk_zcopy(nrows+1, rowptr, nrowptr);
936 for (i=0; i<ncols; i++) {
937 for (ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
938 cand[ncand].val = colind[j];
939 cand[ncand].key = colval[j];
940 }
941 gk_fkvsortd(ncand, cand);
942
943 k = gk_min(topk, ncand);
944 for (j=0; j<k; j++) {
945 nrowind[nrowptr[cand[j].val]] = i;
946 nrowval[nrowptr[cand[j].val]] = cand[j].key;
947 nrowptr[cand[j].val]++;
948 }
949 for (; j<ncand; j++) {
950 if (cand[j].key < keepval)
951 break;
952
953 nrowind[nrowptr[cand[j].val]] = i;
954 nrowval[nrowptr[cand[j].val]] = cand[j].key;
955 nrowptr[cand[j].val]++;
956 }
957 }
958
959 /* compact the nrowind/nrowval */
960 for (nnz=0, i=0; i<nrows; i++) {
961 for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
962 nrowind[nnz] = nrowind[j];
963 nrowval[nnz] = nrowval[j];
964 }
965 nrowptr[i] = nnz;
966 }
967 SHIFTCSR(i, nrows, nrowptr);
968
969 gk_free((void **)&cand, LTERM);
970 break;
971
972 case GK_CSR_ROW:
973 if (mat->rowptr == NULL)
974 gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
975
976 cand = gk_fkvmalloc(ncols, "gk_csr_LowFilter: cand");
977
978 nrowptr[0] = 0;
979 for (nnz=0, i=0; i<nrows; i++) {
980 for (ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
981 cand[ncand].val = rowind[j];
982 cand[ncand].key = rowval[j];
983 }
984 gk_fkvsortd(ncand, cand);
985
986 k = gk_min(topk, ncand);
987 for (j=0; j<k; j++, nnz++) {
988 nrowind[nnz] = cand[j].val;
989 nrowval[nnz] = cand[j].key;
990 }
991 for (; j<ncand; j++, nnz++) {
992 if (cand[j].key < keepval)
993 break;
994
995 nrowind[nnz] = cand[j].val;
996 nrowval[nnz] = cand[j].key;
997 }
998 nrowptr[i+1] = nnz;
999 }
1000
1001 gk_free((void **)&cand, LTERM);
1002 break;
1003
1004 default:
1005 gk_csr_Free(&nmat);
1006 gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
1007 return NULL;
1008 }
1009
1010 return nmat;
1011 }
1012
1013
1014 /*************************************************************************/
1015 /*! Eliminates certain entries from the rows/columns of the matrix. The
1016 filtering takes place by keeping only the terms whose contribution to
1017 the total length of the document is greater than a user-splied multiple
1018 over the average.
1019
1020 This routine assumes that the vectors are normalized to be unit length.
1021
1022 \param mat the matrix to be prunned,
1023 \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
1024 of the matrix will be prunned,
1025 \param zscore is the multiplicative factor over the average contribution
1026 to the length of the document.
1027 \returns the filtered matrix consisting only of its row-based structure.
1028 The input matrix is not modified.
1029 */
1030 /**************************************************************************/
gk_csr_ZScoreFilter(gk_csr_t * mat,int what,float zscore)1031 gk_csr_t *gk_csr_ZScoreFilter(gk_csr_t *mat, int what, float zscore)
1032 {
1033 ssize_t i, j, nnz;
1034 int nrows;
1035 ssize_t *rowptr, *nrowptr;
1036 int *rowind, *nrowind;
1037 float *rowval, *nrowval, avgwgt;
1038 gk_csr_t *nmat;
1039
1040 nmat = gk_csr_Create();
1041
1042 nmat->nrows = mat->nrows;
1043 nmat->ncols = mat->ncols;
1044
1045 nrows = mat->nrows;
1046 rowptr = mat->rowptr;
1047 rowind = mat->rowind;
1048 rowval = mat->rowval;
1049
1050 nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_ZScoreFilter: nrowptr");
1051 nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowind");
1052 nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowval");
1053
1054
1055 switch (what) {
1056 case GK_CSR_COL:
1057 gk_errexit(SIGERR, "This has not been implemented yet.\n");
1058 break;
1059
1060 case GK_CSR_ROW:
1061 if (mat->rowptr == NULL)
1062 gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
1063
1064 nrowptr[0] = 0;
1065 for (nnz=0, i=0; i<nrows; i++) {
1066 avgwgt = zscore/(rowptr[i+1]-rowptr[i]);
1067 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1068 if (rowval[j] > avgwgt) {
1069 nrowind[nnz] = rowind[j];
1070 nrowval[nnz] = rowval[j];
1071 nnz++;
1072 }
1073 }
1074 nrowptr[i+1] = nnz;
1075 }
1076 break;
1077
1078 default:
1079 gk_csr_Free(&nmat);
1080 gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
1081 return NULL;
1082 }
1083
1084 return nmat;
1085 }
1086
1087
1088 /*************************************************************************/
1089 /*! Compacts the column-space of the matrix by removing empty columns.
1090 As a result of the compaction, the column numbers are renumbered.
1091 The compaction operation is done in place and only affects the row-based
1092 representation of the matrix.
1093 The new columns are ordered in decreasing frequency.
1094
1095 \param mat the matrix whose empty columns will be removed.
1096 */
1097 /**************************************************************************/
gk_csr_CompactColumns(gk_csr_t * mat)1098 void gk_csr_CompactColumns(gk_csr_t *mat)
1099 {
1100 ssize_t i;
1101 int nrows, ncols, nncols;
1102 ssize_t *rowptr;
1103 int *rowind, *colmap;
1104 gk_ikv_t *clens;
1105
1106 nrows = mat->nrows;
1107 ncols = mat->ncols;
1108 rowptr = mat->rowptr;
1109 rowind = mat->rowind;
1110
1111 colmap = gk_imalloc(ncols, "gk_csr_CompactColumns: colmap");
1112
1113 clens = gk_ikvmalloc(ncols, "gk_csr_CompactColumns: clens");
1114 for (i=0; i<ncols; i++) {
1115 clens[i].key = 0;
1116 clens[i].val = i;
1117 }
1118
1119 for (i=0; i<rowptr[nrows]; i++)
1120 clens[rowind[i]].key++;
1121 gk_ikvsortd(ncols, clens);
1122
1123 for (nncols=0, i=0; i<ncols; i++) {
1124 if (clens[i].key > 0)
1125 colmap[clens[i].val] = nncols++;
1126 else
1127 break;
1128 }
1129
1130 for (i=0; i<rowptr[nrows]; i++)
1131 rowind[i] = colmap[rowind[i]];
1132
1133 mat->ncols = nncols;
1134
1135 gk_free((void **)&colmap, &clens, LTERM);
1136 }
1137
1138
1139 /*************************************************************************/
1140 /*! Sorts the indices in increasing order
1141 \param mat the matrix itself,
1142 \param what is either GK_CSR_ROW or GK_CSR_COL indicating which set of
1143 indices to sort.
1144 */
1145 /**************************************************************************/
gk_csr_SortIndices(gk_csr_t * mat,int what)1146 void gk_csr_SortIndices(gk_csr_t *mat, int what)
1147 {
1148 int n, nn=0;
1149 ssize_t *ptr;
1150 int *ind;
1151 float *val;
1152
1153 switch (what) {
1154 case GK_CSR_ROW:
1155 if (!mat->rowptr)
1156 gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
1157
1158 n = mat->nrows;
1159 ptr = mat->rowptr;
1160 ind = mat->rowind;
1161 val = mat->rowval;
1162 break;
1163
1164 case GK_CSR_COL:
1165 if (!mat->colptr)
1166 gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
1167
1168 n = mat->ncols;
1169 ptr = mat->colptr;
1170 ind = mat->colind;
1171 val = mat->colval;
1172 break;
1173
1174 default:
1175 gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1176 return;
1177 }
1178
1179 #pragma omp parallel if (n > 100)
1180 {
1181 ssize_t i, j, k;
1182 gk_ikv_t *cand;
1183 float *tval;
1184
1185 #pragma omp single
1186 for (i=0; i<n; i++)
1187 nn = gk_max(nn, ptr[i+1]-ptr[i]);
1188
1189 cand = gk_ikvmalloc(nn, "gk_csr_SortIndices: cand");
1190 tval = gk_fmalloc(nn, "gk_csr_SortIndices: tval");
1191
1192 #pragma omp for schedule(static)
1193 for (i=0; i<n; i++) {
1194 for (k=0, j=ptr[i]; j<ptr[i+1]; j++) {
1195 if (j > ptr[i] && ind[j] < ind[j-1])
1196 k = 1; /* an inversion */
1197 cand[j-ptr[i]].val = j-ptr[i];
1198 cand[j-ptr[i]].key = ind[j];
1199 tval[j-ptr[i]] = val[j];
1200 }
1201 if (k) {
1202 gk_ikvsorti(ptr[i+1]-ptr[i], cand);
1203 for (j=ptr[i]; j<ptr[i+1]; j++) {
1204 ind[j] = cand[j-ptr[i]].key;
1205 val[j] = tval[cand[j-ptr[i]].val];
1206 }
1207 }
1208 }
1209
1210 gk_free((void **)&cand, &tval, LTERM);
1211 }
1212
1213 }
1214
1215
1216 /*************************************************************************/
1217 /*! Creates a row/column index from the column/row data.
1218 \param mat the matrix itself,
1219 \param what is either GK_CSR_ROW or GK_CSR_COL indicating which index
1220 will be created.
1221 */
1222 /**************************************************************************/
gk_csr_CreateIndex(gk_csr_t * mat,int what)1223 void gk_csr_CreateIndex(gk_csr_t *mat, int what)
1224 {
1225 /* 'f' stands for forward, 'r' stands for reverse */
1226 ssize_t i, j, k, nf, nr;
1227 ssize_t *fptr, *rptr;
1228 int *find, *rind;
1229 float *fval, *rval;
1230
1231 switch (what) {
1232 case GK_CSR_COL:
1233 nf = mat->nrows;
1234 fptr = mat->rowptr;
1235 find = mat->rowind;
1236 fval = mat->rowval;
1237
1238 if (mat->colptr) gk_free((void **)&mat->colptr, LTERM);
1239 if (mat->colind) gk_free((void **)&mat->colind, LTERM);
1240 if (mat->colval) gk_free((void **)&mat->colval, LTERM);
1241
1242 nr = mat->ncols;
1243 rptr = mat->colptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
1244 rind = mat->colind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
1245 rval = mat->colval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
1246 break;
1247 case GK_CSR_ROW:
1248 nf = mat->ncols;
1249 fptr = mat->colptr;
1250 find = mat->colind;
1251 fval = mat->colval;
1252
1253 if (mat->rowptr) gk_free((void **)&mat->rowptr, LTERM);
1254 if (mat->rowind) gk_free((void **)&mat->rowind, LTERM);
1255 if (mat->rowval) gk_free((void **)&mat->rowval, LTERM);
1256
1257 nr = mat->nrows;
1258 rptr = mat->rowptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
1259 rind = mat->rowind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
1260 rval = mat->rowval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
1261 break;
1262 default:
1263 gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1264 return;
1265 }
1266
1267
1268 for (i=0; i<nf; i++) {
1269 for (j=fptr[i]; j<fptr[i+1]; j++)
1270 rptr[find[j]]++;
1271 }
1272 MAKECSR(i, nr, rptr);
1273
1274 if (rptr[nr] > 6*nr) {
1275 for (i=0; i<nf; i++) {
1276 for (j=fptr[i]; j<fptr[i+1]; j++)
1277 rind[rptr[find[j]]++] = i;
1278 }
1279 SHIFTCSR(i, nr, rptr);
1280
1281 if (fval) {
1282 for (i=0; i<nf; i++) {
1283 for (j=fptr[i]; j<fptr[i+1]; j++)
1284 rval[rptr[find[j]]++] = fval[j];
1285 }
1286 SHIFTCSR(i, nr, rptr);
1287 }
1288 }
1289 else {
1290 if (fval) {
1291 for (i=0; i<nf; i++) {
1292 for (j=fptr[i]; j<fptr[i+1]; j++) {
1293 k = find[j];
1294 rind[rptr[k]] = i;
1295 rval[rptr[k]++] = fval[j];
1296 }
1297 }
1298 }
1299 else {
1300 for (i=0; i<nf; i++) {
1301 for (j=fptr[i]; j<fptr[i+1]; j++)
1302 rind[rptr[find[j]]++] = i;
1303 }
1304 }
1305 SHIFTCSR(i, nr, rptr);
1306 }
1307 }
1308
1309
1310 /*************************************************************************/
1311 /*! Normalizes the rows/columns of the matrix to be unit
1312 length.
1313 \param mat the matrix itself,
1314 \param what indicates what will be normalized and is obtained by
1315 specifying GK_CSR_ROW, GK_CSR_COL, GK_CSR_ROW|GK_CSR_COL.
1316 \param norm indicates what norm is to normalize to, 1: 1-norm, 2: 2-norm
1317 */
1318 /**************************************************************************/
gk_csr_Normalize(gk_csr_t * mat,int what,int norm)1319 void gk_csr_Normalize(gk_csr_t *mat, int what, int norm)
1320 {
1321 ssize_t i, j;
1322 int n;
1323 ssize_t *ptr;
1324 float *val, sum;
1325
1326 if (what&GK_CSR_ROW && mat->rowval) {
1327 n = mat->nrows;
1328 ptr = mat->rowptr;
1329 val = mat->rowval;
1330
1331 #pragma omp parallel if (ptr[n] > OMPMINOPS)
1332 {
1333 #pragma omp for private(j,sum) schedule(static)
1334 for (i=0; i<n; i++) {
1335 for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++){
1336 if (norm == 2)
1337 sum += val[j]*val[j];
1338 else if (norm == 1)
1339 sum += val[j]; /* assume val[j] > 0 */
1340 }
1341 if (sum > 0) {
1342 if (norm == 2)
1343 sum=1.0/sqrt(sum);
1344 else if (norm == 1)
1345 sum=1.0/sum;
1346 for (j=ptr[i]; j<ptr[i+1]; j++)
1347 val[j] *= sum;
1348
1349 }
1350 }
1351 }
1352 }
1353
1354 if (what&GK_CSR_COL && mat->colval) {
1355 n = mat->ncols;
1356 ptr = mat->colptr;
1357 val = mat->colval;
1358
1359 #pragma omp parallel if (ptr[n] > OMPMINOPS)
1360 {
1361 #pragma omp for private(j,sum) schedule(static)
1362 for (i=0; i<n; i++) {
1363 for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++)
1364 if (norm == 2)
1365 sum += val[j]*val[j];
1366 else if (norm == 1)
1367 sum += val[j];
1368 if (sum > 0) {
1369 if (norm == 2)
1370 sum=1.0/sqrt(sum);
1371 else if (norm == 1)
1372 sum=1.0/sum;
1373 for (j=ptr[i]; j<ptr[i+1]; j++)
1374 val[j] *= sum;
1375 }
1376 }
1377 }
1378 }
1379 }
1380
1381
1382 /*************************************************************************/
1383 /*! Applies different row scaling methods.
1384 \param mat the matrix itself,
1385 \param type indicates the type of row scaling. Possible values are:
1386 GK_CSR_MAXTF, GK_CSR_SQRT, GK_CSR_LOG, GK_CSR_IDF, GK_CSR_MAXTF2.
1387 */
1388 /**************************************************************************/
gk_csr_Scale(gk_csr_t * mat,int type)1389 void gk_csr_Scale(gk_csr_t *mat, int type)
1390 {
1391 ssize_t i, j;
1392 int nrows, ncols, nnzcols, bgfreq;
1393 ssize_t *rowptr;
1394 int *rowind, *collen;
1395 float *rowval, *cscale, maxtf;
1396
1397 nrows = mat->nrows;
1398 rowptr = mat->rowptr;
1399 rowind = mat->rowind;
1400 rowval = mat->rowval;
1401
1402 switch (type) {
1403 case GK_CSR_MAXTF: /* TF' = .5 + .5*TF/MAX(TF) */
1404 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1405 {
1406 #pragma omp for private(j, maxtf) schedule(static)
1407 for (i=0; i<nrows; i++) {
1408 maxtf = fabs(rowval[rowptr[i]]);
1409 for (j=rowptr[i]; j<rowptr[i+1]; j++)
1410 maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
1411
1412 for (j=rowptr[i]; j<rowptr[i+1]; j++)
1413 rowval[j] = .5 + .5*rowval[j]/maxtf;
1414 }
1415 }
1416 break;
1417
1418 case GK_CSR_MAXTF2: /* TF' = .1 + .9*TF/MAX(TF) */
1419 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1420 {
1421 #pragma omp for private(j, maxtf) schedule(static)
1422 for (i=0; i<nrows; i++) {
1423 maxtf = fabs(rowval[rowptr[i]]);
1424 for (j=rowptr[i]; j<rowptr[i+1]; j++)
1425 maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
1426
1427 for (j=rowptr[i]; j<rowptr[i+1]; j++)
1428 rowval[j] = .1 + .9*rowval[j]/maxtf;
1429 }
1430 }
1431 break;
1432
1433 case GK_CSR_SQRT: /* TF' = .1+SQRT(TF) */
1434 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1435 {
1436 #pragma omp for private(j) schedule(static)
1437 for (i=0; i<nrows; i++) {
1438 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1439 if (rowval[j] != 0.0)
1440 rowval[j] = .1+sign(rowval[j], sqrt(fabs(rowval[j])));
1441 }
1442 }
1443 }
1444 break;
1445
1446 case GK_CSR_POW25: /* TF' = .1+POW(TF,.25) */
1447 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1448 {
1449 #pragma omp for private(j) schedule(static)
1450 for (i=0; i<nrows; i++) {
1451 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1452 if (rowval[j] != 0.0)
1453 rowval[j] = .1+sign(rowval[j], sqrt(sqrt(fabs(rowval[j]))));
1454 }
1455 }
1456 }
1457 break;
1458
1459 case GK_CSR_POW65: /* TF' = .1+POW(TF,.65) */
1460 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1461 {
1462 #pragma omp for private(j) schedule(static)
1463 for (i=0; i<nrows; i++) {
1464 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1465 if (rowval[j] != 0.0)
1466 rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .65));
1467 }
1468 }
1469 }
1470 break;
1471
1472 case GK_CSR_POW75: /* TF' = .1+POW(TF,.75) */
1473 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1474 {
1475 #pragma omp for private(j) schedule(static)
1476 for (i=0; i<nrows; i++) {
1477 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1478 if (rowval[j] != 0.0)
1479 rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .75));
1480 }
1481 }
1482 }
1483 break;
1484
1485 case GK_CSR_POW85: /* TF' = .1+POW(TF,.85) */
1486 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1487 {
1488 #pragma omp for private(j) schedule(static)
1489 for (i=0; i<nrows; i++) {
1490 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1491 if (rowval[j] != 0.0)
1492 rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .85));
1493 }
1494 }
1495 }
1496 break;
1497
1498 case GK_CSR_LOG: /* TF' = 1+log_2(TF) */
1499 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1500 {
1501 double logscale = 1.0/log(2.0);
1502 #pragma omp for schedule(static,32)
1503 for (i=0; i<rowptr[nrows]; i++) {
1504 if (rowval[i] != 0.0)
1505 rowval[i] = 1+(rowval[i]>0.0 ? log(rowval[i]) : -log(-rowval[i]))*logscale;
1506 }
1507 #ifdef XXX
1508 #pragma omp for private(j) schedule(static)
1509 for (i=0; i<nrows; i++) {
1510 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1511 if (rowval[j] != 0.0)
1512 rowval[j] = 1+(rowval[j]>0.0 ? log(rowval[j]) : -log(-rowval[j]))*logscale;
1513 //rowval[j] = 1+sign(rowval[j], log(fabs(rowval[j]))*logscale);
1514 }
1515 }
1516 #endif
1517 }
1518 break;
1519
1520 case GK_CSR_IDF: /* TF' = TF*IDF */
1521 ncols = mat->ncols;
1522 cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
1523 collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
1524
1525 for (i=0; i<nrows; i++) {
1526 for (j=rowptr[i]; j<rowptr[i+1]; j++)
1527 collen[rowind[j]]++;
1528 }
1529
1530 #pragma omp parallel if (ncols > OMPMINOPS)
1531 {
1532 #pragma omp for schedule(static)
1533 for (i=0; i<ncols; i++)
1534 cscale[i] = (collen[i] > 0 ? log(1.0*nrows/collen[i]) : 0.0);
1535 }
1536
1537 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1538 {
1539 #pragma omp for private(j) schedule(static)
1540 for (i=0; i<nrows; i++) {
1541 for (j=rowptr[i]; j<rowptr[i+1]; j++)
1542 rowval[j] *= cscale[rowind[j]];
1543 }
1544 }
1545
1546 gk_free((void **)&cscale, &collen, LTERM);
1547 break;
1548
1549 case GK_CSR_IDF2: /* TF' = TF*IDF */
1550 ncols = mat->ncols;
1551 cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
1552 collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
1553
1554 for (i=0; i<nrows; i++) {
1555 for (j=rowptr[i]; j<rowptr[i+1]; j++)
1556 collen[rowind[j]]++;
1557 }
1558
1559 nnzcols = 0;
1560 #pragma omp parallel if (ncols > OMPMINOPS)
1561 {
1562 #pragma omp for schedule(static) reduction(+:nnzcols)
1563 for (i=0; i<ncols; i++)
1564 nnzcols += (collen[i] > 0 ? 1 : 0);
1565
1566 bgfreq = gk_max(10, (ssize_t)(.5*rowptr[nrows]/nnzcols));
1567 printf("nnz: %zd, nnzcols: %d, bgfreq: %d\n", rowptr[nrows], nnzcols, bgfreq);
1568
1569 #pragma omp for schedule(static)
1570 for (i=0; i<ncols; i++)
1571 cscale[i] = (collen[i] > 0 ? log(1.0*(nrows+2*bgfreq)/(bgfreq+collen[i])) : 0.0);
1572 }
1573
1574 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1575 {
1576 #pragma omp for private(j) schedule(static)
1577 for (i=0; i<nrows; i++) {
1578 for (j=rowptr[i]; j<rowptr[i+1]; j++)
1579 rowval[j] *= cscale[rowind[j]];
1580 }
1581 }
1582
1583 gk_free((void **)&cscale, &collen, LTERM);
1584 break;
1585
1586 default:
1587 gk_errexit(SIGERR, "Unknown scaling type of %d\n", type);
1588 }
1589
1590 }
1591
1592
1593 /*************************************************************************/
1594 /*! Computes the sums of the rows/columns
1595 \param mat the matrix itself,
1596 \param what is either GK_CSR_ROW or GK_CSR_COL indicating which
1597 sums to compute.
1598 */
1599 /**************************************************************************/
gk_csr_ComputeSums(gk_csr_t * mat,int what)1600 void gk_csr_ComputeSums(gk_csr_t *mat, int what)
1601 {
1602 ssize_t i;
1603 int n;
1604 ssize_t *ptr;
1605 float *val, *sums;
1606
1607 switch (what) {
1608 case GK_CSR_ROW:
1609 n = mat->nrows;
1610 ptr = mat->rowptr;
1611 val = mat->rowval;
1612
1613 if (mat->rsums)
1614 gk_free((void **)&mat->rsums, LTERM);
1615
1616 sums = mat->rsums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
1617 break;
1618 case GK_CSR_COL:
1619 n = mat->ncols;
1620 ptr = mat->colptr;
1621 val = mat->colval;
1622
1623 if (mat->csums)
1624 gk_free((void **)&mat->csums, LTERM);
1625
1626 sums = mat->csums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
1627 break;
1628 default:
1629 gk_errexit(SIGERR, "Invalid sum type of %d.\n", what);
1630 return;
1631 }
1632
1633 #pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
1634 for (i=0; i<n; i++)
1635 sums[i] = gk_fsum(ptr[i+1]-ptr[i], val+ptr[i], 1);
1636 }
1637
1638
1639 /*************************************************************************/
1640 /*! Computes the squared of the norms of the rows/columns
1641 \param mat the matrix itself,
1642 \param what is either GK_CSR_ROW or GK_CSR_COL indicating which
1643 squared norms to compute.
1644 */
1645 /**************************************************************************/
gk_csr_ComputeSquaredNorms(gk_csr_t * mat,int what)1646 void gk_csr_ComputeSquaredNorms(gk_csr_t *mat, int what)
1647 {
1648 ssize_t i;
1649 int n;
1650 ssize_t *ptr;
1651 float *val, *norms;
1652
1653 switch (what) {
1654 case GK_CSR_ROW:
1655 n = mat->nrows;
1656 ptr = mat->rowptr;
1657 val = mat->rowval;
1658
1659 if (mat->rnorms) gk_free((void **)&mat->rnorms, LTERM);
1660
1661 norms = mat->rnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
1662 break;
1663 case GK_CSR_COL:
1664 n = mat->ncols;
1665 ptr = mat->colptr;
1666 val = mat->colval;
1667
1668 if (mat->cnorms) gk_free((void **)&mat->cnorms, LTERM);
1669
1670 norms = mat->cnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
1671 break;
1672 default:
1673 gk_errexit(SIGERR, "Invalid norm type of %d.\n", what);
1674 return;
1675 }
1676
1677 #pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
1678 for (i=0; i<n; i++)
1679 norms[i] = gk_fdot(ptr[i+1]-ptr[i], val+ptr[i], 1, val+ptr[i], 1);
1680 }
1681
1682
1683 /*************************************************************************/
1684 /*! Computes the similarity between two rows/columns
1685
1686 \param mat the matrix itself. The routine assumes that the indices
1687 are sorted in increasing order.
1688 \param i1 is the first row/column,
1689 \param i2 is the second row/column,
1690 \param what is either GK_CSR_ROW or GK_CSR_COL indicating the type of
1691 objects between the similarity will be computed,
1692 \param simtype is the type of similarity and is one of GK_CSR_COS,
1693 GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN
1694 \returns the similarity between the two rows/columns.
1695 */
1696 /**************************************************************************/
gk_csr_ComputeSimilarity(gk_csr_t * mat,int i1,int i2,int what,int simtype)1697 float gk_csr_ComputeSimilarity(gk_csr_t *mat, int i1, int i2, int what, int simtype)
1698 {
1699 int nind1, nind2;
1700 int *ind1, *ind2;
1701 float *val1, *val2, stat1, stat2, sim;
1702
1703 switch (what) {
1704 case GK_CSR_ROW:
1705 if (!mat->rowptr)
1706 gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
1707 nind1 = mat->rowptr[i1+1]-mat->rowptr[i1];
1708 nind2 = mat->rowptr[i2+1]-mat->rowptr[i2];
1709 ind1 = mat->rowind + mat->rowptr[i1];
1710 ind2 = mat->rowind + mat->rowptr[i2];
1711 val1 = mat->rowval + mat->rowptr[i1];
1712 val2 = mat->rowval + mat->rowptr[i2];
1713 break;
1714
1715 case GK_CSR_COL:
1716 if (!mat->colptr)
1717 gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
1718 nind1 = mat->colptr[i1+1]-mat->colptr[i1];
1719 nind2 = mat->colptr[i2+1]-mat->colptr[i2];
1720 ind1 = mat->colind + mat->colptr[i1];
1721 ind2 = mat->colind + mat->colptr[i2];
1722 val1 = mat->colval + mat->colptr[i1];
1723 val2 = mat->colval + mat->colptr[i2];
1724 break;
1725
1726 default:
1727 gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1728 return 0.0;
1729 }
1730
1731
1732 switch (simtype) {
1733 case GK_CSR_COS:
1734 case GK_CSR_JAC:
1735 sim = stat1 = stat2 = 0.0;
1736 i1 = i2 = 0;
1737 while (i1<nind1 && i2<nind2) {
1738 if (i1 == nind1) {
1739 stat2 += val2[i2]*val2[i2];
1740 i2++;
1741 }
1742 else if (i2 == nind2) {
1743 stat1 += val1[i1]*val1[i1];
1744 i1++;
1745 }
1746 else if (ind1[i1] < ind2[i2]) {
1747 stat1 += val1[i1]*val1[i1];
1748 i1++;
1749 }
1750 else if (ind1[i1] > ind2[i2]) {
1751 stat2 += val2[i2]*val2[i2];
1752 i2++;
1753 }
1754 else {
1755 sim += val1[i1]*val2[i2];
1756 stat1 += val1[i1]*val1[i1];
1757 stat2 += val2[i2]*val2[i2];
1758 i1++;
1759 i2++;
1760 }
1761 }
1762 if (simtype == GK_CSR_COS)
1763 sim = (stat1*stat2 > 0.0 ? sim/sqrt(stat1*stat2) : 0.0);
1764 else
1765 sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
1766 break;
1767
1768 case GK_CSR_MIN:
1769 sim = stat1 = stat2 = 0.0;
1770 i1 = i2 = 0;
1771 while (i1<nind1 && i2<nind2) {
1772 if (i1 == nind1) {
1773 stat2 += val2[i2];
1774 i2++;
1775 }
1776 else if (i2 == nind2) {
1777 stat1 += val1[i1];
1778 i1++;
1779 }
1780 else if (ind1[i1] < ind2[i2]) {
1781 stat1 += val1[i1];
1782 i1++;
1783 }
1784 else if (ind1[i1] > ind2[i2]) {
1785 stat2 += val2[i2];
1786 i2++;
1787 }
1788 else {
1789 sim += gk_min(val1[i1],val2[i2]);
1790 stat1 += val1[i1];
1791 stat2 += val2[i2];
1792 i1++;
1793 i2++;
1794 }
1795 }
1796 sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
1797
1798 break;
1799
1800 case GK_CSR_AMIN:
1801 sim = stat1 = stat2 = 0.0;
1802 i1 = i2 = 0;
1803 while (i1<nind1 && i2<nind2) {
1804 if (i1 == nind1) {
1805 stat2 += val2[i2];
1806 i2++;
1807 }
1808 else if (i2 == nind2) {
1809 stat1 += val1[i1];
1810 i1++;
1811 }
1812 else if (ind1[i1] < ind2[i2]) {
1813 stat1 += val1[i1];
1814 i1++;
1815 }
1816 else if (ind1[i1] > ind2[i2]) {
1817 stat2 += val2[i2];
1818 i2++;
1819 }
1820 else {
1821 sim += gk_min(val1[i1],val2[i2]);
1822 stat1 += val1[i1];
1823 stat2 += val2[i2];
1824 i1++;
1825 i2++;
1826 }
1827 }
1828 sim = (stat1 > 0.0 ? sim/stat1 : 0.0);
1829
1830 break;
1831
1832 default:
1833 gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
1834 return -1;
1835 }
1836
1837 return sim;
1838
1839 }
1840
1841
1842 /*************************************************************************/
1843 /*! Finds the n most similar rows (neighbors) to the query using cosine
1844 similarity.
1845
1846 \param mat the matrix itself
1847 \param nqterms is the number of columns in the query
1848 \param qind is the list of query columns
1849 \param qval is the list of correspodning query weights
1850 \param simtype is the type of similarity and is one of GK_CSR_COS,
1851 GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN
1852 \param nsim is the maximum number of requested most similar rows.
1853 If -1 is provided, then everything is returned unsorted.
1854 \param minsim is the minimum similarity of the requested most
1855 similar rows
1856 \param hits is the result set. This array should be at least
1857 of length nsim.
1858 \param i_marker is an array of size equal to the number of rows
1859 whose values are initialized to -1. If NULL is provided
1860 then this array is allocated and freed internally.
1861 \param i_cand is an array of size equal to the number of rows.
1862 If NULL is provided then this array is allocated and freed
1863 internally.
1864 \returns the number of identified most similar rows, which can be
1865 smaller than the requested number of nnbrs in those cases
1866 in which there are no sufficiently many neighbors.
1867 */
1868 /**************************************************************************/
gk_csr_GetSimilarRows(gk_csr_t * mat,int nqterms,int * qind,float * qval,int simtype,int nsim,float minsim,gk_fkv_t * hits,int * i_marker,gk_fkv_t * i_cand)1869 int gk_csr_GetSimilarRows(gk_csr_t *mat, int nqterms, int *qind,
1870 float *qval, int simtype, int nsim, float minsim, gk_fkv_t *hits,
1871 int *i_marker, gk_fkv_t *i_cand)
1872 {
1873 ssize_t i, ii, j, k;
1874 int nrows, ncols, ncand;
1875 ssize_t *colptr;
1876 int *colind, *marker;
1877 float *colval, *rnorms, mynorm, *rsums, mysum;
1878 gk_fkv_t *cand;
1879
1880 if (nqterms == 0)
1881 return 0;
1882
1883 nrows = mat->nrows;
1884 ncols = mat->ncols;
1885 colptr = mat->colptr;
1886 colind = mat->colind;
1887 colval = mat->colval;
1888
1889 marker = (i_marker ? i_marker : gk_ismalloc(nrows, -1, "gk_csr_SimilarRows: marker"));
1890 cand = (i_cand ? i_cand : gk_fkvmalloc(nrows, "gk_csr_SimilarRows: cand"));
1891
1892 switch (simtype) {
1893 case GK_CSR_COS:
1894 for (ncand=0, ii=0; ii<nqterms; ii++) {
1895 i = qind[ii];
1896 if (i < ncols) {
1897 for (j=colptr[i]; j<colptr[i+1]; j++) {
1898 k = colind[j];
1899 if (marker[k] == -1) {
1900 cand[ncand].val = k;
1901 cand[ncand].key = 0;
1902 marker[k] = ncand++;
1903 }
1904 cand[marker[k]].key += colval[j]*qval[ii];
1905 }
1906 }
1907 }
1908 break;
1909
1910 case GK_CSR_JAC:
1911 for (ncand=0, ii=0; ii<nqterms; ii++) {
1912 i = qind[ii];
1913 if (i < ncols) {
1914 for (j=colptr[i]; j<colptr[i+1]; j++) {
1915 k = colind[j];
1916 if (marker[k] == -1) {
1917 cand[ncand].val = k;
1918 cand[ncand].key = 0;
1919 marker[k] = ncand++;
1920 }
1921 cand[marker[k]].key += colval[j]*qval[ii];
1922 }
1923 }
1924 }
1925
1926 rnorms = mat->rnorms;
1927 mynorm = gk_fdot(nqterms, qval, 1, qval, 1);
1928
1929 for (i=0; i<ncand; i++)
1930 cand[i].key = cand[i].key/(rnorms[cand[i].val]+mynorm-cand[i].key);
1931 break;
1932
1933 case GK_CSR_MIN:
1934 for (ncand=0, ii=0; ii<nqterms; ii++) {
1935 i = qind[ii];
1936 if (i < ncols) {
1937 for (j=colptr[i]; j<colptr[i+1]; j++) {
1938 k = colind[j];
1939 if (marker[k] == -1) {
1940 cand[ncand].val = k;
1941 cand[ncand].key = 0;
1942 marker[k] = ncand++;
1943 }
1944 cand[marker[k]].key += gk_min(colval[j], qval[ii]);
1945 }
1946 }
1947 }
1948
1949 rsums = mat->rsums;
1950 mysum = gk_fsum(nqterms, qval, 1);
1951
1952 for (i=0; i<ncand; i++)
1953 cand[i].key = cand[i].key/(rsums[cand[i].val]+mysum-cand[i].key);
1954 break;
1955
1956 /* Assymetric MIN similarity */
1957 case GK_CSR_AMIN:
1958 for (ncand=0, ii=0; ii<nqterms; ii++) {
1959 i = qind[ii];
1960 if (i < ncols) {
1961 for (j=colptr[i]; j<colptr[i+1]; j++) {
1962 k = colind[j];
1963 if (marker[k] == -1) {
1964 cand[ncand].val = k;
1965 cand[ncand].key = 0;
1966 marker[k] = ncand++;
1967 }
1968 cand[marker[k]].key += gk_min(colval[j], qval[ii]);
1969 }
1970 }
1971 }
1972
1973 mysum = gk_fsum(nqterms, qval, 1);
1974
1975 for (i=0; i<ncand; i++)
1976 cand[i].key = cand[i].key/mysum;
1977 break;
1978
1979 default:
1980 gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
1981 return -1;
1982 }
1983
1984 /* go and prune the hits that are bellow minsim */
1985 for (j=0, i=0; i<ncand; i++) {
1986 marker[cand[i].val] = -1;
1987 if (cand[i].key >= minsim)
1988 cand[j++] = cand[i];
1989 }
1990 ncand = j;
1991
1992 if (nsim == -1 || nsim >= ncand) {
1993 nsim = ncand;
1994 }
1995 else {
1996 nsim = gk_min(nsim, ncand);
1997 gk_dfkvkselect(ncand, nsim, cand);
1998 gk_fkvsortd(nsim, cand);
1999 }
2000
2001 gk_fkvcopy(nsim, cand, hits);
2002
2003 if (i_marker == NULL)
2004 gk_free((void **)&marker, LTERM);
2005 if (i_cand == NULL)
2006 gk_free((void **)&cand, LTERM);
2007
2008 return nsim;
2009 }
2010
2011