1 /*!
2  * \file
3  *
4  * \brief Various routines with dealing with CSR matrices
5  *
6  * \author George Karypis
7  * \version\verbatim $Id: csr.c 13437 2013-01-11 21:54:10Z karypis $ \endverbatim
8  */
9 
10 #include <GKlib.h>
11 
12 #define OMPMINOPS       50000
13 
14 /*************************************************************************/
15 /*! Allocate memory for a CSR matrix and initializes it
16     \returns the allocated matrix. The various fields are set to NULL.
17 */
18 /**************************************************************************/
gk_csr_Create()19 gk_csr_t *gk_csr_Create()
20 {
21   gk_csr_t *mat;
22 
23   mat = (gk_csr_t *)gk_malloc(sizeof(gk_csr_t), "gk_csr_Create: mat");
24 
25   gk_csr_Init(mat);
26 
27   return mat;
28 }
29 
30 
31 /*************************************************************************/
32 /*! Initializes the matrix
33     \param mat is the matrix to be initialized.
34 */
35 /*************************************************************************/
gk_csr_Init(gk_csr_t * mat)36 void gk_csr_Init(gk_csr_t *mat)
37 {
38   memset(mat, 0, sizeof(gk_csr_t));
39   mat->nrows = mat->ncols = -1;
40 }
41 
42 
43 /*************************************************************************/
44 /*! Frees all the memory allocated for matrix.
45     \param mat is the matrix to be freed.
46 */
47 /*************************************************************************/
gk_csr_Free(gk_csr_t ** mat)48 void gk_csr_Free(gk_csr_t **mat)
49 {
50   if (*mat == NULL)
51     return;
52   gk_csr_FreeContents(*mat);
53   gk_free((void **)mat, LTERM);
54 }
55 
56 
57 /*************************************************************************/
58 /*! Frees only the memory allocated for the matrix's different fields and
59     sets them to NULL.
60     \param mat is the matrix whose contents will be freed.
61 */
62 /*************************************************************************/
gk_csr_FreeContents(gk_csr_t * mat)63 void gk_csr_FreeContents(gk_csr_t *mat)
64 {
65   gk_free((void *)&mat->rowptr, &mat->rowind, &mat->rowval, &mat->rowids,
66           &mat->colptr, &mat->colind, &mat->colval, &mat->colids,
67           &mat->rnorms, &mat->cnorms, &mat->rsums, &mat->csums,
68           &mat->rsizes, &mat->csizes, &mat->rvols, &mat->cvols,
69           &mat->rwgts, &mat->cwgts,
70           LTERM);
71 }
72 
73 
74 /*************************************************************************/
75 /*! Returns a copy of a matrix.
76     \param mat is the matrix to be duplicated.
77     \returns the newly created copy of the matrix.
78 */
79 /**************************************************************************/
gk_csr_Dup(gk_csr_t * mat)80 gk_csr_t *gk_csr_Dup(gk_csr_t *mat)
81 {
82   gk_csr_t *nmat;
83 
84   nmat = gk_csr_Create();
85 
86   nmat->nrows  = mat->nrows;
87   nmat->ncols  = mat->ncols;
88 
89   /* copy the row structure */
90   if (mat->rowptr)
91     nmat->rowptr = gk_zcopy(mat->nrows+1, mat->rowptr,
92                             gk_zmalloc(mat->nrows+1, "gk_csr_Dup: rowptr"));
93   if (mat->rowids)
94     nmat->rowids = gk_icopy(mat->nrows, mat->rowids,
95                             gk_imalloc(mat->nrows, "gk_csr_Dup: rowids"));
96   if (mat->rnorms)
97     nmat->rnorms = gk_fcopy(mat->nrows, mat->rnorms,
98                             gk_fmalloc(mat->nrows, "gk_csr_Dup: rnorms"));
99   if (mat->rowind)
100     nmat->rowind = gk_icopy(mat->rowptr[mat->nrows], mat->rowind,
101                             gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowind"));
102   if (mat->rowval)
103     nmat->rowval = gk_fcopy(mat->rowptr[mat->nrows], mat->rowval,
104                             gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowval"));
105 
106   /* copy the col structure */
107   if (mat->colptr)
108     nmat->colptr = gk_zcopy(mat->ncols+1, mat->colptr,
109                             gk_zmalloc(mat->ncols+1, "gk_csr_Dup: colptr"));
110   if (mat->colids)
111     nmat->colids = gk_icopy(mat->ncols, mat->colids,
112                             gk_imalloc(mat->ncols, "gk_csr_Dup: colids"));
113   if (mat->cnorms)
114     nmat->cnorms = gk_fcopy(mat->ncols, mat->cnorms,
115                             gk_fmalloc(mat->ncols, "gk_csr_Dup: cnorms"));
116   if (mat->colind)
117     nmat->colind = gk_icopy(mat->colptr[mat->ncols], mat->colind,
118                             gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colind"));
119   if (mat->colval)
120     nmat->colval = gk_fcopy(mat->colptr[mat->ncols], mat->colval,
121                             gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colval"));
122 
123   return nmat;
124 }
125 
126 
127 /*************************************************************************/
128 /*! Returns a submatrix containint a set of consecutive rows.
129     \param mat is the original matrix.
130     \param rstart is the starting row.
131     \param nrows is the number of rows from rstart to extract.
132     \returns the row structure of the newly created submatrix.
133 */
134 /**************************************************************************/
gk_csr_ExtractSubmatrix(gk_csr_t * mat,int rstart,int nrows)135 gk_csr_t *gk_csr_ExtractSubmatrix(gk_csr_t *mat, int rstart, int nrows)
136 {
137   ssize_t i;
138   gk_csr_t *nmat;
139 
140   if (rstart+nrows > mat->nrows)
141     return NULL;
142 
143   nmat = gk_csr_Create();
144 
145   nmat->nrows  = nrows;
146   nmat->ncols  = mat->ncols;
147 
148   /* copy the row structure */
149   if (mat->rowptr)
150     nmat->rowptr = gk_zcopy(nrows+1, mat->rowptr+rstart,
151                               gk_zmalloc(nrows+1, "gk_csr_ExtractSubmatrix: rowptr"));
152   for (i=nrows; i>=0; i--)
153     nmat->rowptr[i] -= nmat->rowptr[0];
154   ASSERT(nmat->rowptr[0] == 0);
155 
156   if (mat->rowids)
157     nmat->rowids = gk_icopy(nrows, mat->rowids+rstart,
158                             gk_imalloc(nrows, "gk_csr_ExtractSubmatrix: rowids"));
159   if (mat->rnorms)
160     nmat->rnorms = gk_fcopy(nrows, mat->rnorms+rstart,
161                             gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rnorms"));
162 
163   if (mat->rsums)
164     nmat->rsums = gk_fcopy(nrows, mat->rsums+rstart,
165                             gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rsums"));
166 
167   ASSERT(nmat->rowptr[nrows] == mat->rowptr[rstart+nrows]-mat->rowptr[rstart]);
168   if (mat->rowind)
169     nmat->rowind = gk_icopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
170                             mat->rowind+mat->rowptr[rstart],
171                             gk_imalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
172                                        "gk_csr_ExtractSubmatrix: rowind"));
173   if (mat->rowval)
174     nmat->rowval = gk_fcopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
175                             mat->rowval+mat->rowptr[rstart],
176                             gk_fmalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
177                                        "gk_csr_ExtractSubmatrix: rowval"));
178 
179   return nmat;
180 }
181 
182 
183 /*************************************************************************/
184 /*! Returns a submatrix containing a certain set of rows.
185     \param mat is the original matrix.
186     \param nrows is the number of rows to extract.
187     \param rind is the set of row numbers to extract.
188     \returns the row structure of the newly created submatrix.
189 */
190 /**************************************************************************/
gk_csr_ExtractRows(gk_csr_t * mat,int nrows,int * rind)191 gk_csr_t *gk_csr_ExtractRows(gk_csr_t *mat, int nrows, int *rind)
192 {
193   ssize_t i, ii, j, nnz;
194   gk_csr_t *nmat;
195 
196   nmat = gk_csr_Create();
197 
198   nmat->nrows = nrows;
199   nmat->ncols = mat->ncols;
200 
201   for (nnz=0, i=0; i<nrows; i++)
202     nnz += mat->rowptr[rind[i]+1]-mat->rowptr[rind[i]];
203 
204   nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
205   nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
206   nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
207 
208   nmat->rowptr[0] = 0;
209   for (nnz=0, j=0, ii=0; ii<nrows; ii++) {
210     i = rind[ii];
211     gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
212     gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
213     nnz += mat->rowptr[i+1]-mat->rowptr[i];
214     nmat->rowptr[++j] = nnz;
215   }
216   ASSERT(j == nmat->nrows);
217 
218   return nmat;
219 }
220 
221 
222 /*************************************************************************/
223 /*! Returns a submatrix corresponding to a specified partitioning of rows.
224     \param mat is the original matrix.
225     \param part is the partitioning vector of the rows.
226     \param pid is the partition ID that will be extracted.
227     \returns the row structure of the newly created submatrix.
228 */
229 /**************************************************************************/
gk_csr_ExtractPartition(gk_csr_t * mat,int * part,int pid)230 gk_csr_t *gk_csr_ExtractPartition(gk_csr_t *mat, int *part, int pid)
231 {
232   ssize_t i, j, nnz;
233   gk_csr_t *nmat;
234 
235   nmat = gk_csr_Create();
236 
237   nmat->nrows = 0;
238   nmat->ncols = mat->ncols;
239 
240   for (nnz=0, i=0; i<mat->nrows; i++) {
241     if (part[i] == pid) {
242       nmat->nrows++;
243       nnz += mat->rowptr[i+1]-mat->rowptr[i];
244     }
245   }
246 
247   nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
248   nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
249   nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
250 
251   nmat->rowptr[0] = 0;
252   for (nnz=0, j=0, i=0; i<mat->nrows; i++) {
253     if (part[i] == pid) {
254       gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
255       gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
256       nnz += mat->rowptr[i+1]-mat->rowptr[i];
257       nmat->rowptr[++j] = nnz;
258     }
259   }
260   ASSERT(j == nmat->nrows);
261 
262   return nmat;
263 }
264 
265 
266 /*************************************************************************/
267 /*! Splits the matrix into multiple sub-matrices based on the provided
268     color array.
269     \param mat is the original matrix.
270     \param color is an array of size equal to the number of non-zeros
271            in the matrix (row-wise structure). The matrix is split into
272            as many parts as the number of colors. For meaningfull results,
273            the colors should be numbered consecutively starting from 0.
274     \returns an array of matrices for each supplied color number.
275 */
276 /**************************************************************************/
gk_csr_Split(gk_csr_t * mat,int * color)277 gk_csr_t **gk_csr_Split(gk_csr_t *mat, int *color)
278 {
279   ssize_t i, j;
280   int nrows, ncolors;
281   ssize_t *rowptr;
282   int *rowind;
283   float *rowval;
284   gk_csr_t **smats;
285 
286   nrows  = mat->nrows;
287   rowptr = mat->rowptr;
288   rowind = mat->rowind;
289   rowval = mat->rowval;
290 
291   ncolors = gk_imax(rowptr[nrows], color)+1;
292 
293   smats = (gk_csr_t **)gk_malloc(sizeof(gk_csr_t *)*ncolors, "gk_csr_Split: smats");
294   for (i=0; i<ncolors; i++) {
295     smats[i] = gk_csr_Create();
296     smats[i]->nrows  = mat->nrows;
297     smats[i]->ncols  = mat->ncols;
298     smats[i]->rowptr = gk_zsmalloc(nrows+1, 0, "gk_csr_Split: smats[i]->rowptr");
299   }
300 
301   for (i=0; i<nrows; i++) {
302     for (j=rowptr[i]; j<rowptr[i+1]; j++)
303       smats[color[j]]->rowptr[i]++;
304   }
305   for (i=0; i<ncolors; i++)
306     MAKECSR(j, nrows, smats[i]->rowptr);
307 
308   for (i=0; i<ncolors; i++) {
309     smats[i]->rowind = gk_imalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowind");
310     smats[i]->rowval = gk_fmalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowval");
311   }
312 
313   for (i=0; i<nrows; i++) {
314     for (j=rowptr[i]; j<rowptr[i+1]; j++) {
315       smats[color[j]]->rowind[smats[color[j]]->rowptr[i]] = rowind[j];
316       smats[color[j]]->rowval[smats[color[j]]->rowptr[i]] = rowval[j];
317       smats[color[j]]->rowptr[i]++;
318     }
319   }
320 
321   for (i=0; i<ncolors; i++)
322     SHIFTCSR(j, nrows, smats[i]->rowptr);
323 
324   return smats;
325 }
326 
327 
328 /**************************************************************************/
329 /*! Reads a CSR matrix from the supplied file and stores it the matrix's
330     forward structure.
331     \param filename is the file that stores the data.
332     \param format is either GK_CSR_FMT_METIS, GK_CSR_FMT_CLUTO,
333            GK_CSR_FMT_CSR, GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL
334            specifying the type of the input format.
335            The GK_CSR_FMT_CSR does not contain a header
336            line, whereas the GK_CSR_FMT_BINROW is a binary format written
337            by gk_csr_Write() using the same format specifier.
338     \param readvals is either 1 or 0, indicating if the CSR file contains
339            values or it does not. It only applies when GK_CSR_FMT_CSR is
340            used.
341     \param numbering is either 1 or 0, indicating if the numbering of the
342            indices start from 1 or 0, respectively. If they start from 1,
343            they are automatically decreamented during input so that they
344            will start from 0. It only applies when GK_CSR_FMT_CSR is
345            used.
346     \returns the matrix that was read.
347 */
348 /**************************************************************************/
gk_csr_Read(char * filename,int format,int readvals,int numbering)349 gk_csr_t *gk_csr_Read(char *filename, int format, int readvals, int numbering)
350 {
351   ssize_t i, k, l;
352   size_t nfields, nrows, ncols, nnz, fmt, ncon;
353   size_t lnlen;
354   ssize_t *rowptr;
355   int *rowind, ival;
356   float *rowval=NULL, fval;
357   int readsizes, readwgts;
358   char *line=NULL, *head, *tail, fmtstr[256];
359   FILE *fpin;
360   gk_csr_t *mat=NULL;
361 
362 
363   if (!gk_fexists(filename))
364     gk_errexit(SIGERR, "File %s does not exist!\n", filename);
365 
366   if (format == GK_CSR_FMT_BINROW) {
367     mat = gk_csr_Create();
368 
369     fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
370     if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
371       gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
372     if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
373       gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
374     mat->rowptr = gk_zmalloc(mat->nrows+1, "gk_csr_Read: rowptr");
375     if (fread(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpin) != mat->nrows+1)
376       gk_errexit(SIGERR, "Failed to read the rowptr from file %s!\n", filename);
377     mat->rowind = gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowind");
378     if (fread(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
379       gk_errexit(SIGERR, "Failed to read the rowind from file %s!\n", filename);
380     if (readvals == 1) {
381       mat->rowval = gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowval");
382       if (fread(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
383         gk_errexit(SIGERR, "Failed to read the rowval from file %s!\n", filename);
384     }
385 
386     gk_fclose(fpin);
387     return mat;
388   }
389 
390   if (format == GK_CSR_FMT_BINCOL) {
391     mat = gk_csr_Create();
392 
393     fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
394     if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
395       gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
396     if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
397       gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
398     mat->colptr = gk_zmalloc(mat->ncols+1, "gk_csr_Read: colptr");
399     if (fread(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpin) != mat->ncols+1)
400       gk_errexit(SIGERR, "Failed to read the colptr from file %s!\n", filename);
401     mat->colind = gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Read: colind");
402     if (fread(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
403       gk_errexit(SIGERR, "Failed to read the colind from file %s!\n", filename);
404     if (readvals) {
405       mat->colval = gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Read: colval");
406       if (fread(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
407         gk_errexit(SIGERR, "Failed to read the colval from file %s!\n", filename);
408     }
409 
410     gk_fclose(fpin);
411     return mat;
412   }
413 
414 
415   if (format == GK_CSR_FMT_CLUTO) {
416     fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
417     do {
418       if (gk_getline(&line, &lnlen, fpin) <= 0)
419         gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
420     } while (line[0] == '%');
421 
422     if (sscanf(line, "%zu %zu %zu", &nrows, &ncols, &nnz) != 3)
423       gk_errexit(SIGERR, "Header line must contain 3 integers.\n");
424 
425     readsizes = 0;
426     readwgts  = 0;
427     readvals  = 1;
428     numbering = 1;
429   }
430   else if (format == GK_CSR_FMT_METIS) {
431     fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
432     do {
433       if (gk_getline(&line, &lnlen, fpin) <= 0)
434         gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
435     } while (line[0] == '%');
436 
437     fmt = ncon = 0;
438     nfields = sscanf(line, "%zu %zu %zu %zu", &nrows, &nnz, &fmt, &ncon);
439     if (nfields < 2)
440       gk_errexit(SIGERR, "Header line must contain at least 2 integers (#vtxs and #edges).\n");
441 
442     ncols = nrows;
443     nnz *= 2;
444 
445     if (fmt > 111)
446       gk_errexit(SIGERR, "Cannot read this type of file format [fmt=%zu]!\n", fmt);
447 
448     sprintf(fmtstr, "%03zu", fmt%1000);
449     readsizes = (fmtstr[0] == '1');
450     readwgts  = (fmtstr[1] == '1');
451     readvals  = (fmtstr[2] == '1');
452     numbering = 1;
453     ncon      = (ncon == 0 ? 1 : ncon);
454   }
455   else {
456     readsizes = 0;
457     readwgts  = 0;
458 
459     gk_getfilestats(filename, &nrows, &nnz, NULL, NULL);
460 
461     if (readvals == 1 && nnz%2 == 1)
462       gk_errexit(SIGERR, "Error: The number of numbers (%zd %d) in the input file is not even.\n", nnz, readvals);
463     if (readvals == 1)
464       nnz = nnz/2;
465     fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
466   }
467 
468   mat = gk_csr_Create();
469 
470   mat->nrows = nrows;
471 
472   rowptr = mat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Read: rowptr");
473   rowind = mat->rowind = gk_imalloc(nnz, "gk_csr_Read: rowind");
474   if (readvals != 2)
475     rowval = mat->rowval = gk_fsmalloc(nnz, 1.0, "gk_csr_Read: rowval");
476 
477   if (readsizes)
478     mat->rsizes = gk_fsmalloc(nrows, 0.0, "gk_csr_Read: rsizes");
479 
480   if (readwgts)
481     mat->rwgts = gk_fsmalloc(nrows*ncon, 0.0, "gk_csr_Read: rwgts");
482 
483   /*----------------------------------------------------------------------
484    * Read the sparse matrix file
485    *---------------------------------------------------------------------*/
486   numbering = (numbering ? - 1 : 0);
487   for (ncols=0, rowptr[0]=0, k=0, i=0; i<nrows; i++) {
488     do {
489       if (gk_getline(&line, &lnlen, fpin) == -1)
490         gk_errexit(SIGERR, "Premature end of input file: file while reading row %d\n", i);
491     } while (line[0] == '%');
492 
493     head = line;
494     tail = NULL;
495 
496     /* Read vertex sizes */
497     if (readsizes) {
498 #ifdef __MSC__
499       mat->rsizes[i] = (float)strtod(head, &tail);
500 #else
501       mat->rsizes[i] = strtof(head, &tail);
502 #endif
503       if (tail == head)
504         gk_errexit(SIGERR, "The line for vertex %zd does not have size information\n", i+1);
505       if (mat->rsizes[i] < 0)
506         errexit("The size for vertex %zd must be >= 0\n", i+1);
507       head = tail;
508     }
509 
510     /* Read vertex weights */
511     if (readwgts) {
512       for (l=0; l<ncon; l++) {
513 #ifdef __MSC__
514         mat->rwgts[i*ncon+l] = (float)strtod(head, &tail);
515 #else
516         mat->rwgts[i*ncon+l] = strtof(head, &tail);
517 #endif
518         if (tail == head)
519           errexit("The line for vertex %zd does not have enough weights "
520                   "for the %d constraints.\n", i+1, ncon);
521         if (mat->rwgts[i*ncon+l] < 0)
522           errexit("The weight vertex %zd and constraint %zd must be >= 0\n", i+1, l);
523         head = tail;
524       }
525     }
526 
527 
528     /* Read the rest of the row */
529     while (1) {
530       ival = (int)strtol(head, &tail, 0);
531       if (tail == head)
532         break;
533       head = tail;
534 
535       if ((rowind[k] = ival + numbering) < 0)
536         gk_errexit(SIGERR, "Error: Invalid column number %d at row %zd.\n", ival, i);
537 
538       ncols = gk_max(rowind[k], ncols);
539 
540       if (readvals == 1) {
541 #ifdef __MSC__
542         fval = (float)strtod(head, &tail);
543 #else
544 	fval = strtof(head, &tail);
545 #endif
546         if (tail == head)
547           gk_errexit(SIGERR, "Value could not be found for column! Row:%zd, NNZ:%zd\n", i, k);
548         head = tail;
549 
550         rowval[k] = fval;
551       }
552       k++;
553     }
554     rowptr[i+1] = k;
555   }
556 
557   if (format == GK_CSR_FMT_METIS) {
558     ASSERT(ncols+1 == mat->nrows);
559     mat->ncols = mat->nrows;
560   }
561   else {
562     mat->ncols = ncols+1;
563   }
564 
565   if (k != nnz)
566     gk_errexit(SIGERR, "gk_csr_Read: Something wrong with the number of nonzeros in "
567                        "the input file. NNZ=%zd, ActualNNZ=%zd.\n", nnz, k);
568 
569   gk_fclose(fpin);
570 
571   gk_free((void **)&line, LTERM);
572 
573   return mat;
574 }
575 
576 
577 /**************************************************************************/
578 /*! Writes the row-based structure of a matrix into a file.
579     \param mat is the matrix to be written,
580     \param filename is the name of the output file.
581     \param format is one of: GK_CSR_FMT_CLUTO, GK_CSR_FMT_CSR,
582            GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL.
583     \param writevals is either 1 or 0 indicating if the values will be
584            written or not. This is only applicable when GK_CSR_FMT_CSR
585            is used.
586     \param numbering is either 1 or 0 indicating if the internal 0-based
587            numbering will be shifted by one or not during output. This
588            is only applicable when GK_CSR_FMT_CSR is used.
589 */
590 /**************************************************************************/
gk_csr_Write(gk_csr_t * mat,char * filename,int format,int writevals,int numbering)591 void gk_csr_Write(gk_csr_t *mat, char *filename, int format, int writevals, int numbering)
592 {
593   ssize_t i, j;
594   FILE *fpout;
595 
596   if (format == GK_CSR_FMT_BINROW) {
597     if (filename == NULL)
598       gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
599     fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
600 
601     fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);
602     fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);
603     fwrite(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpout);
604     fwrite(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpout);
605     if (writevals)
606       fwrite(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpout);
607 
608     gk_fclose(fpout);
609     return;
610   }
611 
612   if (format == GK_CSR_FMT_BINCOL) {
613     if (filename == NULL)
614       gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
615     fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
616 
617     fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);
618     fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);
619     fwrite(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpout);
620     fwrite(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpout);
621     if (writevals)
622       fwrite(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpout);
623 
624     gk_fclose(fpout);
625     return;
626   }
627 
628   if (filename)
629     fpout = gk_fopen(filename, "w", "gk_csr_Write: fpout");
630   else
631     fpout = stdout;
632 
633   if (format == GK_CSR_FMT_CLUTO) {
634     fprintf(fpout, "%d %d %zd\n", mat->nrows, mat->ncols, mat->rowptr[mat->nrows]);
635     writevals = 1;
636     numbering = 1;
637   }
638 
639   for (i=0; i<mat->nrows; i++) {
640     for (j=mat->rowptr[i]; j<mat->rowptr[i+1]; j++) {
641       fprintf(fpout, " %d", mat->rowind[j]+(numbering ? 1 : 0));
642       if (writevals)
643         fprintf(fpout, " %f", mat->rowval[j]);
644     }
645     fprintf(fpout, "\n");
646   }
647   if (filename)
648     gk_fclose(fpout);
649 }
650 
651 
652 /*************************************************************************/
653 /*! Prunes certain rows/columns of the matrix. The prunning takes place
654     by analyzing the row structure of the matrix. The prunning takes place
655     by removing rows/columns but it does not affect the numbering of the
656     remaining rows/columns.
657 
658     \param mat the matrix to be prunned,
659     \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
660            of the matrix will be prunned,
661     \param minf is the minimum number of rows (columns) that a column (row) must
662            be present in order to be kept,
663     \param maxf is the maximum number of rows (columns) that a column (row) must
664           be present at in order to be kept.
665     \returns the prunned matrix consisting only of its row-based structure.
666           The input matrix is not modified.
667 */
668 /**************************************************************************/
gk_csr_Prune(gk_csr_t * mat,int what,int minf,int maxf)669 gk_csr_t *gk_csr_Prune(gk_csr_t *mat, int what, int minf, int maxf)
670 {
671   ssize_t i, j, nnz;
672   int nrows, ncols;
673   ssize_t *rowptr, *nrowptr;
674   int *rowind, *nrowind, *collen;
675   float *rowval, *nrowval;
676   gk_csr_t *nmat;
677 
678   nmat = gk_csr_Create();
679 
680   nrows = nmat->nrows = mat->nrows;
681   ncols = nmat->ncols = mat->ncols;
682 
683   rowptr = mat->rowptr;
684   rowind = mat->rowind;
685   rowval = mat->rowval;
686 
687   nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Prune: nrowptr");
688   nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_Prune: nrowind");
689   nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_Prune: nrowval");
690 
691 
692   switch (what) {
693     case GK_CSR_COL:
694       collen = gk_ismalloc(ncols, 0, "gk_csr_Prune: collen");
695 
696       for (i=0; i<nrows; i++) {
697         for (j=rowptr[i]; j<rowptr[i+1]; j++) {
698           ASSERT(rowind[j] < ncols);
699           collen[rowind[j]]++;
700         }
701       }
702       for (i=0; i<ncols; i++)
703         collen[i] = (collen[i] >= minf && collen[i] <= maxf ? 1 : 0);
704 
705       nrowptr[0] = 0;
706       for (nnz=0, i=0; i<nrows; i++) {
707         for (j=rowptr[i]; j<rowptr[i+1]; j++) {
708           if (collen[rowind[j]]) {
709             nrowind[nnz] = rowind[j];
710             nrowval[nnz] = rowval[j];
711             nnz++;
712           }
713         }
714         nrowptr[i+1] = nnz;
715       }
716       gk_free((void **)&collen, LTERM);
717       break;
718 
719     case GK_CSR_ROW:
720       nrowptr[0] = 0;
721       for (nnz=0, i=0; i<nrows; i++) {
722         if (rowptr[i+1]-rowptr[i] >= minf && rowptr[i+1]-rowptr[i] <= maxf) {
723           for (j=rowptr[i]; j<rowptr[i+1]; j++, nnz++) {
724             nrowind[nnz] = rowind[j];
725             nrowval[nnz] = rowval[j];
726           }
727         }
728         nrowptr[i+1] = nnz;
729       }
730       break;
731 
732     default:
733       gk_csr_Free(&nmat);
734       gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
735       return NULL;
736   }
737 
738   return nmat;
739 }
740 
741 
742 /*************************************************************************/
743 /*! Eliminates certain entries from the rows/columns of the matrix. The
744     filtering takes place by keeping only the highest weight entries whose
745     sum accounts for a certain fraction of the overall weight of the
746     row/column.
747 
748     \param mat the matrix to be prunned,
749     \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
750            of the matrix will be prunned,
751     \param norm indicates the norm that will be used to aggregate the weights
752            and possible values are 1 or 2,
753     \param fraction is the fraction of the overall norm that will be retained
754            by the kept entries.
755     \returns the filtered matrix consisting only of its row-based structure.
756            The input matrix is not modified.
757 */
758 /**************************************************************************/
gk_csr_LowFilter(gk_csr_t * mat,int what,int norm,float fraction)759 gk_csr_t *gk_csr_LowFilter(gk_csr_t *mat, int what, int norm, float fraction)
760 {
761   ssize_t i, j, nnz;
762   int nrows, ncols, ncand, maxlen=0;
763   ssize_t *rowptr, *colptr, *nrowptr;
764   int *rowind, *colind, *nrowind;
765   float *rowval, *colval, *nrowval, rsum, tsum;
766   gk_csr_t *nmat;
767   gk_fkv_t *cand;
768 
769   nmat = gk_csr_Create();
770 
771   nrows = nmat->nrows = mat->nrows;
772   ncols = nmat->ncols = mat->ncols;
773 
774   rowptr = mat->rowptr;
775   rowind = mat->rowind;
776   rowval = mat->rowval;
777   colptr = mat->colptr;
778   colind = mat->colind;
779   colval = mat->colval;
780 
781   nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
782   nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
783   nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
784 
785 
786   switch (what) {
787     case GK_CSR_COL:
788       if (mat->colptr == NULL)
789         gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
790 
791       gk_zcopy(nrows+1, rowptr, nrowptr);
792 
793       for (i=0; i<ncols; i++)
794         maxlen = gk_max(maxlen, colptr[i+1]-colptr[i]);
795 
796       #pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
797       {
798         cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
799 
800         #pragma omp for schedule(static)
801         for (i=0; i<ncols; i++) {
802           for (tsum=0.0, ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
803             cand[ncand].val = colind[j];
804             cand[ncand].key = colval[j];
805             tsum += (norm == 1 ? colval[j] : colval[j]*colval[j]);
806           }
807           gk_fkvsortd(ncand, cand);
808 
809           for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
810             rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
811             nrowind[nrowptr[cand[j].val]] = i;
812             nrowval[nrowptr[cand[j].val]] = cand[j].key;
813             nrowptr[cand[j].val]++;
814           }
815         }
816 
817         gk_free((void **)&cand, LTERM);
818       }
819 
820       /* compact the nrowind/nrowval */
821       for (nnz=0, i=0; i<nrows; i++) {
822         for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
823           nrowind[nnz] = nrowind[j];
824           nrowval[nnz] = nrowval[j];
825         }
826         nrowptr[i] = nnz;
827       }
828       SHIFTCSR(i, nrows, nrowptr);
829 
830       break;
831 
832     case GK_CSR_ROW:
833       if (mat->rowptr == NULL)
834         gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
835 
836       for (i=0; i<nrows; i++)
837         maxlen = gk_max(maxlen, rowptr[i+1]-rowptr[i]);
838 
839       #pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
840       {
841         cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
842 
843         #pragma omp for schedule(static)
844         for (i=0; i<nrows; i++) {
845           for (tsum=0.0, ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
846             cand[ncand].val = rowind[j];
847             cand[ncand].key = rowval[j];
848             tsum += (norm == 1 ? rowval[j] : rowval[j]*rowval[j]);
849           }
850           gk_fkvsortd(ncand, cand);
851 
852           for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
853             rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
854             nrowind[rowptr[i]+j] = cand[j].val;
855             nrowval[rowptr[i]+j] = cand[j].key;
856           }
857           nrowptr[i+1] = rowptr[i]+j;
858         }
859 
860         gk_free((void **)&cand, LTERM);
861       }
862 
863       /* compact nrowind/nrowval */
864       nrowptr[0] = nnz = 0;
865       for (i=0; i<nrows; i++) {
866         for (j=rowptr[i]; j<nrowptr[i+1]; j++, nnz++) {
867           nrowind[nnz] = nrowind[j];
868           nrowval[nnz] = nrowval[j];
869         }
870         nrowptr[i+1] = nnz;
871       }
872 
873       break;
874 
875     default:
876       gk_csr_Free(&nmat);
877       gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
878       return NULL;
879   }
880 
881   return nmat;
882 }
883 
884 
885 /*************************************************************************/
886 /*! Eliminates certain entries from the rows/columns of the matrix. The
887     filtering takes place by keeping only the highest weight top-K entries
888     along each row/column and those entries whose weight is greater than
889     a specified value.
890 
891     \param mat the matrix to be prunned,
892     \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
893            of the matrix will be prunned,
894     \param topk is the number of the highest weight entries to keep.
895     \param keepval is the weight of a term above which will be kept. This
896            is used to select additional terms past the first topk.
897     \returns the filtered matrix consisting only of its row-based structure.
898            The input matrix is not modified.
899 */
900 /**************************************************************************/
gk_csr_TopKPlusFilter(gk_csr_t * mat,int what,int topk,float keepval)901 gk_csr_t *gk_csr_TopKPlusFilter(gk_csr_t *mat, int what, int topk, float keepval)
902 {
903   ssize_t i, j, k, nnz;
904   int nrows, ncols, ncand;
905   ssize_t *rowptr, *colptr, *nrowptr;
906   int *rowind, *colind, *nrowind;
907   float *rowval, *colval, *nrowval;
908   gk_csr_t *nmat;
909   gk_fkv_t *cand;
910 
911   nmat = gk_csr_Create();
912 
913   nrows = nmat->nrows = mat->nrows;
914   ncols = nmat->ncols = mat->ncols;
915 
916   rowptr = mat->rowptr;
917   rowind = mat->rowind;
918   rowval = mat->rowval;
919   colptr = mat->colptr;
920   colind = mat->colind;
921   colval = mat->colval;
922 
923   nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
924   nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
925   nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
926 
927 
928   switch (what) {
929     case GK_CSR_COL:
930       if (mat->colptr == NULL)
931         gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
932 
933       cand = gk_fkvmalloc(nrows, "gk_csr_LowFilter: cand");
934 
935       gk_zcopy(nrows+1, rowptr, nrowptr);
936       for (i=0; i<ncols; i++) {
937         for (ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
938           cand[ncand].val = colind[j];
939           cand[ncand].key = colval[j];
940         }
941         gk_fkvsortd(ncand, cand);
942 
943         k = gk_min(topk, ncand);
944         for (j=0; j<k; j++) {
945           nrowind[nrowptr[cand[j].val]] = i;
946           nrowval[nrowptr[cand[j].val]] = cand[j].key;
947           nrowptr[cand[j].val]++;
948         }
949         for (; j<ncand; j++) {
950           if (cand[j].key < keepval)
951             break;
952 
953           nrowind[nrowptr[cand[j].val]] = i;
954           nrowval[nrowptr[cand[j].val]] = cand[j].key;
955           nrowptr[cand[j].val]++;
956         }
957       }
958 
959       /* compact the nrowind/nrowval */
960       for (nnz=0, i=0; i<nrows; i++) {
961         for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
962           nrowind[nnz] = nrowind[j];
963           nrowval[nnz] = nrowval[j];
964         }
965         nrowptr[i] = nnz;
966       }
967       SHIFTCSR(i, nrows, nrowptr);
968 
969       gk_free((void **)&cand, LTERM);
970       break;
971 
972     case GK_CSR_ROW:
973       if (mat->rowptr == NULL)
974         gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
975 
976       cand = gk_fkvmalloc(ncols, "gk_csr_LowFilter: cand");
977 
978       nrowptr[0] = 0;
979       for (nnz=0, i=0; i<nrows; i++) {
980         for (ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
981           cand[ncand].val = rowind[j];
982           cand[ncand].key = rowval[j];
983         }
984         gk_fkvsortd(ncand, cand);
985 
986         k = gk_min(topk, ncand);
987         for (j=0; j<k; j++, nnz++) {
988           nrowind[nnz] = cand[j].val;
989           nrowval[nnz] = cand[j].key;
990         }
991         for (; j<ncand; j++, nnz++) {
992           if (cand[j].key < keepval)
993             break;
994 
995           nrowind[nnz] = cand[j].val;
996           nrowval[nnz] = cand[j].key;
997         }
998         nrowptr[i+1] = nnz;
999       }
1000 
1001       gk_free((void **)&cand, LTERM);
1002       break;
1003 
1004     default:
1005       gk_csr_Free(&nmat);
1006       gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
1007       return NULL;
1008   }
1009 
1010   return nmat;
1011 }
1012 
1013 
1014 /*************************************************************************/
1015 /*! Eliminates certain entries from the rows/columns of the matrix. The
1016     filtering takes place by keeping only the terms whose contribution to
1017     the total length of the document is greater than a user-splied multiple
1018     over the average.
1019 
1020     This routine assumes that the vectors are normalized to be unit length.
1021 
1022     \param mat the matrix to be prunned,
1023     \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
1024            of the matrix will be prunned,
1025     \param zscore is the multiplicative factor over the average contribution
1026            to the length of the document.
1027     \returns the filtered matrix consisting only of its row-based structure.
1028            The input matrix is not modified.
1029 */
1030 /**************************************************************************/
gk_csr_ZScoreFilter(gk_csr_t * mat,int what,float zscore)1031 gk_csr_t *gk_csr_ZScoreFilter(gk_csr_t *mat, int what, float zscore)
1032 {
1033   ssize_t i, j, nnz;
1034   int nrows;
1035   ssize_t *rowptr, *nrowptr;
1036   int *rowind, *nrowind;
1037   float *rowval, *nrowval, avgwgt;
1038   gk_csr_t *nmat;
1039 
1040   nmat = gk_csr_Create();
1041 
1042   nmat->nrows = mat->nrows;
1043   nmat->ncols = mat->ncols;
1044 
1045   nrows  = mat->nrows;
1046   rowptr = mat->rowptr;
1047   rowind = mat->rowind;
1048   rowval = mat->rowval;
1049 
1050   nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_ZScoreFilter: nrowptr");
1051   nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowind");
1052   nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowval");
1053 
1054 
1055   switch (what) {
1056     case GK_CSR_COL:
1057       gk_errexit(SIGERR, "This has not been implemented yet.\n");
1058       break;
1059 
1060     case GK_CSR_ROW:
1061       if (mat->rowptr == NULL)
1062         gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
1063 
1064       nrowptr[0] = 0;
1065       for (nnz=0, i=0; i<nrows; i++) {
1066         avgwgt = zscore/(rowptr[i+1]-rowptr[i]);
1067         for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1068           if (rowval[j] > avgwgt) {
1069             nrowind[nnz] = rowind[j];
1070             nrowval[nnz] = rowval[j];
1071             nnz++;
1072           }
1073         }
1074         nrowptr[i+1] = nnz;
1075       }
1076       break;
1077 
1078     default:
1079       gk_csr_Free(&nmat);
1080       gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
1081       return NULL;
1082   }
1083 
1084   return nmat;
1085 }
1086 
1087 
1088 /*************************************************************************/
1089 /*! Compacts the column-space of the matrix by removing empty columns.
1090     As a result of the compaction, the column numbers are renumbered.
1091     The compaction operation is done in place and only affects the row-based
1092     representation of the matrix.
1093     The new columns are ordered in decreasing frequency.
1094 
1095     \param mat the matrix whose empty columns will be removed.
1096 */
1097 /**************************************************************************/
gk_csr_CompactColumns(gk_csr_t * mat)1098 void gk_csr_CompactColumns(gk_csr_t *mat)
1099 {
1100   ssize_t i;
1101   int nrows, ncols, nncols;
1102   ssize_t *rowptr;
1103   int *rowind, *colmap;
1104   gk_ikv_t *clens;
1105 
1106   nrows  = mat->nrows;
1107   ncols  = mat->ncols;
1108   rowptr = mat->rowptr;
1109   rowind = mat->rowind;
1110 
1111   colmap = gk_imalloc(ncols, "gk_csr_CompactColumns: colmap");
1112 
1113   clens = gk_ikvmalloc(ncols, "gk_csr_CompactColumns: clens");
1114   for (i=0; i<ncols; i++) {
1115     clens[i].key = 0;
1116     clens[i].val = i;
1117   }
1118 
1119   for (i=0; i<rowptr[nrows]; i++)
1120     clens[rowind[i]].key++;
1121   gk_ikvsortd(ncols, clens);
1122 
1123   for (nncols=0, i=0; i<ncols; i++) {
1124     if (clens[i].key > 0)
1125       colmap[clens[i].val] = nncols++;
1126     else
1127       break;
1128   }
1129 
1130   for (i=0; i<rowptr[nrows]; i++)
1131     rowind[i] = colmap[rowind[i]];
1132 
1133   mat->ncols = nncols;
1134 
1135   gk_free((void **)&colmap, &clens, LTERM);
1136 }
1137 
1138 
1139 /*************************************************************************/
1140 /*! Sorts the indices in increasing order
1141     \param mat the matrix itself,
1142     \param what is either GK_CSR_ROW or GK_CSR_COL indicating which set of
1143            indices to sort.
1144 */
1145 /**************************************************************************/
gk_csr_SortIndices(gk_csr_t * mat,int what)1146 void gk_csr_SortIndices(gk_csr_t *mat, int what)
1147 {
1148   int n, nn=0;
1149   ssize_t *ptr;
1150   int *ind;
1151   float *val;
1152 
1153   switch (what) {
1154     case GK_CSR_ROW:
1155       if (!mat->rowptr)
1156         gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
1157 
1158       n   = mat->nrows;
1159       ptr = mat->rowptr;
1160       ind = mat->rowind;
1161       val = mat->rowval;
1162       break;
1163 
1164     case GK_CSR_COL:
1165       if (!mat->colptr)
1166         gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
1167 
1168       n   = mat->ncols;
1169       ptr = mat->colptr;
1170       ind = mat->colind;
1171       val = mat->colval;
1172       break;
1173 
1174     default:
1175       gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1176       return;
1177   }
1178 
1179   #pragma omp parallel if (n > 100)
1180   {
1181     ssize_t i, j, k;
1182     gk_ikv_t *cand;
1183     float *tval;
1184 
1185     #pragma omp single
1186     for (i=0; i<n; i++)
1187       nn = gk_max(nn, ptr[i+1]-ptr[i]);
1188 
1189     cand = gk_ikvmalloc(nn, "gk_csr_SortIndices: cand");
1190     tval = gk_fmalloc(nn, "gk_csr_SortIndices: tval");
1191 
1192     #pragma omp for schedule(static)
1193     for (i=0; i<n; i++) {
1194       for (k=0, j=ptr[i]; j<ptr[i+1]; j++) {
1195         if (j > ptr[i] && ind[j] < ind[j-1])
1196           k = 1; /* an inversion */
1197         cand[j-ptr[i]].val = j-ptr[i];
1198         cand[j-ptr[i]].key = ind[j];
1199         tval[j-ptr[i]]     = val[j];
1200       }
1201       if (k) {
1202         gk_ikvsorti(ptr[i+1]-ptr[i], cand);
1203         for (j=ptr[i]; j<ptr[i+1]; j++) {
1204           ind[j] = cand[j-ptr[i]].key;
1205           val[j] = tval[cand[j-ptr[i]].val];
1206         }
1207       }
1208     }
1209 
1210     gk_free((void **)&cand, &tval, LTERM);
1211   }
1212 
1213 }
1214 
1215 
1216 /*************************************************************************/
1217 /*! Creates a row/column index from the column/row data.
1218     \param mat the matrix itself,
1219     \param what is either GK_CSR_ROW or GK_CSR_COL indicating which index
1220            will be created.
1221 */
1222 /**************************************************************************/
gk_csr_CreateIndex(gk_csr_t * mat,int what)1223 void gk_csr_CreateIndex(gk_csr_t *mat, int what)
1224 {
1225   /* 'f' stands for forward, 'r' stands for reverse */
1226   ssize_t i, j, k, nf, nr;
1227   ssize_t *fptr, *rptr;
1228   int *find, *rind;
1229   float *fval, *rval;
1230 
1231   switch (what) {
1232     case GK_CSR_COL:
1233       nf   = mat->nrows;
1234       fptr = mat->rowptr;
1235       find = mat->rowind;
1236       fval = mat->rowval;
1237 
1238       if (mat->colptr) gk_free((void **)&mat->colptr, LTERM);
1239       if (mat->colind) gk_free((void **)&mat->colind, LTERM);
1240       if (mat->colval) gk_free((void **)&mat->colval, LTERM);
1241 
1242       nr   = mat->ncols;
1243       rptr = mat->colptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
1244       rind = mat->colind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
1245       rval = mat->colval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
1246       break;
1247     case GK_CSR_ROW:
1248       nf   = mat->ncols;
1249       fptr = mat->colptr;
1250       find = mat->colind;
1251       fval = mat->colval;
1252 
1253       if (mat->rowptr) gk_free((void **)&mat->rowptr, LTERM);
1254       if (mat->rowind) gk_free((void **)&mat->rowind, LTERM);
1255       if (mat->rowval) gk_free((void **)&mat->rowval, LTERM);
1256 
1257       nr   = mat->nrows;
1258       rptr = mat->rowptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
1259       rind = mat->rowind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
1260       rval = mat->rowval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
1261       break;
1262     default:
1263       gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1264       return;
1265   }
1266 
1267 
1268   for (i=0; i<nf; i++) {
1269     for (j=fptr[i]; j<fptr[i+1]; j++)
1270       rptr[find[j]]++;
1271   }
1272   MAKECSR(i, nr, rptr);
1273 
1274   if (rptr[nr] > 6*nr) {
1275     for (i=0; i<nf; i++) {
1276       for (j=fptr[i]; j<fptr[i+1]; j++)
1277         rind[rptr[find[j]]++] = i;
1278     }
1279     SHIFTCSR(i, nr, rptr);
1280 
1281     if (fval) {
1282       for (i=0; i<nf; i++) {
1283         for (j=fptr[i]; j<fptr[i+1]; j++)
1284           rval[rptr[find[j]]++] = fval[j];
1285       }
1286       SHIFTCSR(i, nr, rptr);
1287     }
1288   }
1289   else {
1290     if (fval) {
1291       for (i=0; i<nf; i++) {
1292         for (j=fptr[i]; j<fptr[i+1]; j++) {
1293           k = find[j];
1294           rind[rptr[k]]   = i;
1295           rval[rptr[k]++] = fval[j];
1296         }
1297       }
1298     }
1299     else {
1300       for (i=0; i<nf; i++) {
1301         for (j=fptr[i]; j<fptr[i+1]; j++)
1302           rind[rptr[find[j]]++] = i;
1303       }
1304     }
1305     SHIFTCSR(i, nr, rptr);
1306   }
1307 }
1308 
1309 
1310 /*************************************************************************/
1311 /*! Normalizes the rows/columns of the matrix to be unit
1312     length.
1313     \param mat the matrix itself,
1314     \param what indicates what will be normalized and is obtained by
1315            specifying GK_CSR_ROW, GK_CSR_COL, GK_CSR_ROW|GK_CSR_COL.
1316     \param norm indicates what norm is to normalize to, 1: 1-norm, 2: 2-norm
1317 */
1318 /**************************************************************************/
gk_csr_Normalize(gk_csr_t * mat,int what,int norm)1319 void gk_csr_Normalize(gk_csr_t *mat, int what, int norm)
1320 {
1321   ssize_t i, j;
1322   int n;
1323   ssize_t *ptr;
1324   float *val, sum;
1325 
1326   if (what&GK_CSR_ROW && mat->rowval) {
1327     n   = mat->nrows;
1328     ptr = mat->rowptr;
1329     val = mat->rowval;
1330 
1331     #pragma omp parallel if (ptr[n] > OMPMINOPS)
1332     {
1333       #pragma omp for private(j,sum) schedule(static)
1334       for (i=0; i<n; i++) {
1335         for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++){
1336   	if (norm == 2)
1337   	  sum += val[j]*val[j];
1338   	else if (norm == 1)
1339   	  sum += val[j]; /* assume val[j] > 0 */
1340         }
1341         if (sum > 0) {
1342   	if (norm == 2)
1343   	  sum=1.0/sqrt(sum);
1344   	else if (norm == 1)
1345   	  sum=1.0/sum;
1346           for (j=ptr[i]; j<ptr[i+1]; j++)
1347             val[j] *= sum;
1348 
1349         }
1350       }
1351     }
1352   }
1353 
1354   if (what&GK_CSR_COL && mat->colval) {
1355     n   = mat->ncols;
1356     ptr = mat->colptr;
1357     val = mat->colval;
1358 
1359     #pragma omp parallel if (ptr[n] > OMPMINOPS)
1360     {
1361     #pragma omp for private(j,sum) schedule(static)
1362       for (i=0; i<n; i++) {
1363         for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++)
1364   	if (norm == 2)
1365   	  sum += val[j]*val[j];
1366   	else if (norm == 1)
1367   	  sum += val[j];
1368         if (sum > 0) {
1369   	if (norm == 2)
1370   	  sum=1.0/sqrt(sum);
1371   	else if (norm == 1)
1372   	  sum=1.0/sum;
1373           for (j=ptr[i]; j<ptr[i+1]; j++)
1374             val[j] *= sum;
1375         }
1376       }
1377     }
1378   }
1379 }
1380 
1381 
1382 /*************************************************************************/
1383 /*! Applies different row scaling methods.
1384     \param mat the matrix itself,
1385     \param type indicates the type of row scaling. Possible values are:
1386            GK_CSR_MAXTF, GK_CSR_SQRT, GK_CSR_LOG, GK_CSR_IDF, GK_CSR_MAXTF2.
1387 */
1388 /**************************************************************************/
gk_csr_Scale(gk_csr_t * mat,int type)1389 void gk_csr_Scale(gk_csr_t *mat, int type)
1390 {
1391   ssize_t i, j;
1392   int nrows, ncols, nnzcols, bgfreq;
1393   ssize_t *rowptr;
1394   int *rowind, *collen;
1395   float *rowval, *cscale, maxtf;
1396 
1397   nrows  = mat->nrows;
1398   rowptr = mat->rowptr;
1399   rowind = mat->rowind;
1400   rowval = mat->rowval;
1401 
1402   switch (type) {
1403     case GK_CSR_MAXTF: /* TF' = .5 + .5*TF/MAX(TF) */
1404       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1405       {
1406         #pragma omp for private(j, maxtf) schedule(static)
1407         for (i=0; i<nrows; i++) {
1408           maxtf = fabs(rowval[rowptr[i]]);
1409           for (j=rowptr[i]; j<rowptr[i+1]; j++)
1410             maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
1411 
1412           for (j=rowptr[i]; j<rowptr[i+1]; j++)
1413             rowval[j] = .5 + .5*rowval[j]/maxtf;
1414         }
1415       }
1416       break;
1417 
1418     case GK_CSR_MAXTF2: /* TF' = .1 + .9*TF/MAX(TF) */
1419       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1420       {
1421         #pragma omp for private(j, maxtf) schedule(static)
1422         for (i=0; i<nrows; i++) {
1423           maxtf = fabs(rowval[rowptr[i]]);
1424           for (j=rowptr[i]; j<rowptr[i+1]; j++)
1425             maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
1426 
1427           for (j=rowptr[i]; j<rowptr[i+1]; j++)
1428             rowval[j] = .1 + .9*rowval[j]/maxtf;
1429         }
1430       }
1431       break;
1432 
1433     case GK_CSR_SQRT: /* TF' = .1+SQRT(TF) */
1434       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1435       {
1436         #pragma omp for private(j) schedule(static)
1437         for (i=0; i<nrows; i++) {
1438           for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1439             if (rowval[j] != 0.0)
1440               rowval[j] = .1+sign(rowval[j], sqrt(fabs(rowval[j])));
1441           }
1442         }
1443       }
1444       break;
1445 
1446     case GK_CSR_POW25: /* TF' = .1+POW(TF,.25) */
1447       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1448       {
1449         #pragma omp for private(j) schedule(static)
1450         for (i=0; i<nrows; i++) {
1451           for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1452             if (rowval[j] != 0.0)
1453               rowval[j] = .1+sign(rowval[j], sqrt(sqrt(fabs(rowval[j]))));
1454           }
1455         }
1456       }
1457       break;
1458 
1459     case GK_CSR_POW65: /* TF' = .1+POW(TF,.65) */
1460       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1461       {
1462         #pragma omp for private(j) schedule(static)
1463         for (i=0; i<nrows; i++) {
1464           for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1465             if (rowval[j] != 0.0)
1466               rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .65));
1467           }
1468         }
1469       }
1470       break;
1471 
1472     case GK_CSR_POW75: /* TF' = .1+POW(TF,.75) */
1473       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1474       {
1475         #pragma omp for private(j) schedule(static)
1476         for (i=0; i<nrows; i++) {
1477           for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1478             if (rowval[j] != 0.0)
1479               rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .75));
1480           }
1481         }
1482       }
1483       break;
1484 
1485     case GK_CSR_POW85: /* TF' = .1+POW(TF,.85) */
1486       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1487       {
1488         #pragma omp for private(j) schedule(static)
1489         for (i=0; i<nrows; i++) {
1490           for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1491             if (rowval[j] != 0.0)
1492               rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .85));
1493           }
1494         }
1495       }
1496       break;
1497 
1498     case GK_CSR_LOG: /* TF' = 1+log_2(TF) */
1499       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1500       {
1501         double logscale = 1.0/log(2.0);
1502         #pragma omp for schedule(static,32)
1503         for (i=0; i<rowptr[nrows]; i++) {
1504           if (rowval[i] != 0.0)
1505             rowval[i] = 1+(rowval[i]>0.0 ? log(rowval[i]) : -log(-rowval[i]))*logscale;
1506         }
1507 #ifdef XXX
1508         #pragma omp for private(j) schedule(static)
1509         for (i=0; i<nrows; i++) {
1510           for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1511             if (rowval[j] != 0.0)
1512               rowval[j] = 1+(rowval[j]>0.0 ? log(rowval[j]) : -log(-rowval[j]))*logscale;
1513               //rowval[j] = 1+sign(rowval[j], log(fabs(rowval[j]))*logscale);
1514           }
1515         }
1516 #endif
1517       }
1518       break;
1519 
1520     case GK_CSR_IDF: /* TF' = TF*IDF */
1521       ncols  = mat->ncols;
1522       cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
1523       collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
1524 
1525       for (i=0; i<nrows; i++) {
1526         for (j=rowptr[i]; j<rowptr[i+1]; j++)
1527           collen[rowind[j]]++;
1528       }
1529 
1530       #pragma omp parallel if (ncols > OMPMINOPS)
1531       {
1532         #pragma omp for schedule(static)
1533         for (i=0; i<ncols; i++)
1534           cscale[i] = (collen[i] > 0 ? log(1.0*nrows/collen[i]) : 0.0);
1535       }
1536 
1537       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1538       {
1539         #pragma omp for private(j) schedule(static)
1540         for (i=0; i<nrows; i++) {
1541           for (j=rowptr[i]; j<rowptr[i+1]; j++)
1542             rowval[j] *= cscale[rowind[j]];
1543         }
1544       }
1545 
1546       gk_free((void **)&cscale, &collen, LTERM);
1547       break;
1548 
1549     case GK_CSR_IDF2: /* TF' = TF*IDF */
1550       ncols  = mat->ncols;
1551       cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
1552       collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
1553 
1554       for (i=0; i<nrows; i++) {
1555         for (j=rowptr[i]; j<rowptr[i+1]; j++)
1556           collen[rowind[j]]++;
1557       }
1558 
1559       nnzcols = 0;
1560       #pragma omp parallel if (ncols > OMPMINOPS)
1561       {
1562         #pragma omp for schedule(static) reduction(+:nnzcols)
1563         for (i=0; i<ncols; i++)
1564           nnzcols += (collen[i] > 0 ? 1 : 0);
1565 
1566         bgfreq = gk_max(10, (ssize_t)(.5*rowptr[nrows]/nnzcols));
1567         printf("nnz: %zd, nnzcols: %d, bgfreq: %d\n", rowptr[nrows], nnzcols, bgfreq);
1568 
1569         #pragma omp for schedule(static)
1570         for (i=0; i<ncols; i++)
1571           cscale[i] = (collen[i] > 0 ? log(1.0*(nrows+2*bgfreq)/(bgfreq+collen[i])) : 0.0);
1572       }
1573 
1574       #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1575       {
1576         #pragma omp for private(j) schedule(static)
1577         for (i=0; i<nrows; i++) {
1578           for (j=rowptr[i]; j<rowptr[i+1]; j++)
1579             rowval[j] *= cscale[rowind[j]];
1580         }
1581       }
1582 
1583       gk_free((void **)&cscale, &collen, LTERM);
1584       break;
1585 
1586     default:
1587       gk_errexit(SIGERR, "Unknown scaling type of %d\n", type);
1588   }
1589 
1590 }
1591 
1592 
1593 /*************************************************************************/
1594 /*! Computes the sums of the rows/columns
1595     \param mat the matrix itself,
1596     \param what is either GK_CSR_ROW or GK_CSR_COL indicating which
1597            sums to compute.
1598 */
1599 /**************************************************************************/
gk_csr_ComputeSums(gk_csr_t * mat,int what)1600 void gk_csr_ComputeSums(gk_csr_t *mat, int what)
1601 {
1602   ssize_t i;
1603   int n;
1604   ssize_t *ptr;
1605   float *val, *sums;
1606 
1607   switch (what) {
1608     case GK_CSR_ROW:
1609       n   = mat->nrows;
1610       ptr = mat->rowptr;
1611       val = mat->rowval;
1612 
1613       if (mat->rsums)
1614         gk_free((void **)&mat->rsums, LTERM);
1615 
1616       sums = mat->rsums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
1617       break;
1618     case GK_CSR_COL:
1619       n   = mat->ncols;
1620       ptr = mat->colptr;
1621       val = mat->colval;
1622 
1623       if (mat->csums)
1624         gk_free((void **)&mat->csums, LTERM);
1625 
1626       sums = mat->csums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
1627       break;
1628     default:
1629       gk_errexit(SIGERR, "Invalid sum type of %d.\n", what);
1630       return;
1631   }
1632 
1633   #pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
1634   for (i=0; i<n; i++)
1635     sums[i] = gk_fsum(ptr[i+1]-ptr[i], val+ptr[i], 1);
1636 }
1637 
1638 
1639 /*************************************************************************/
1640 /*! Computes the squared of the norms of the rows/columns
1641     \param mat the matrix itself,
1642     \param what is either GK_CSR_ROW or GK_CSR_COL indicating which
1643            squared norms to compute.
1644 */
1645 /**************************************************************************/
gk_csr_ComputeSquaredNorms(gk_csr_t * mat,int what)1646 void gk_csr_ComputeSquaredNorms(gk_csr_t *mat, int what)
1647 {
1648   ssize_t i;
1649   int n;
1650   ssize_t *ptr;
1651   float *val, *norms;
1652 
1653   switch (what) {
1654     case GK_CSR_ROW:
1655       n   = mat->nrows;
1656       ptr = mat->rowptr;
1657       val = mat->rowval;
1658 
1659       if (mat->rnorms) gk_free((void **)&mat->rnorms, LTERM);
1660 
1661       norms = mat->rnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
1662       break;
1663     case GK_CSR_COL:
1664       n   = mat->ncols;
1665       ptr = mat->colptr;
1666       val = mat->colval;
1667 
1668       if (mat->cnorms) gk_free((void **)&mat->cnorms, LTERM);
1669 
1670       norms = mat->cnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
1671       break;
1672     default:
1673       gk_errexit(SIGERR, "Invalid norm type of %d.\n", what);
1674       return;
1675   }
1676 
1677   #pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
1678   for (i=0; i<n; i++)
1679     norms[i] = gk_fdot(ptr[i+1]-ptr[i], val+ptr[i], 1, val+ptr[i], 1);
1680 }
1681 
1682 
1683 /*************************************************************************/
1684 /*! Computes the similarity between two rows/columns
1685 
1686     \param mat the matrix itself. The routine assumes that the indices
1687            are sorted in increasing order.
1688     \param i1 is the first row/column,
1689     \param i2 is the second row/column,
1690     \param what is either GK_CSR_ROW or GK_CSR_COL indicating the type of
1691            objects between the similarity will be computed,
1692     \param simtype is the type of similarity and is one of GK_CSR_COS,
1693            GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN
1694     \returns the similarity between the two rows/columns.
1695 */
1696 /**************************************************************************/
gk_csr_ComputeSimilarity(gk_csr_t * mat,int i1,int i2,int what,int simtype)1697 float gk_csr_ComputeSimilarity(gk_csr_t *mat, int i1, int i2, int what, int simtype)
1698 {
1699   int nind1, nind2;
1700   int *ind1, *ind2;
1701   float *val1, *val2, stat1, stat2, sim;
1702 
1703   switch (what) {
1704     case GK_CSR_ROW:
1705       if (!mat->rowptr)
1706         gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
1707       nind1 = mat->rowptr[i1+1]-mat->rowptr[i1];
1708       nind2 = mat->rowptr[i2+1]-mat->rowptr[i2];
1709       ind1  = mat->rowind + mat->rowptr[i1];
1710       ind2  = mat->rowind + mat->rowptr[i2];
1711       val1  = mat->rowval + mat->rowptr[i1];
1712       val2  = mat->rowval + mat->rowptr[i2];
1713       break;
1714 
1715     case GK_CSR_COL:
1716       if (!mat->colptr)
1717         gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
1718       nind1 = mat->colptr[i1+1]-mat->colptr[i1];
1719       nind2 = mat->colptr[i2+1]-mat->colptr[i2];
1720       ind1  = mat->colind + mat->colptr[i1];
1721       ind2  = mat->colind + mat->colptr[i2];
1722       val1  = mat->colval + mat->colptr[i1];
1723       val2  = mat->colval + mat->colptr[i2];
1724       break;
1725 
1726     default:
1727       gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1728       return 0.0;
1729   }
1730 
1731 
1732   switch (simtype) {
1733     case GK_CSR_COS:
1734     case GK_CSR_JAC:
1735       sim = stat1 = stat2 = 0.0;
1736       i1 = i2 = 0;
1737       while (i1<nind1 && i2<nind2) {
1738         if (i1 == nind1) {
1739           stat2 += val2[i2]*val2[i2];
1740           i2++;
1741         }
1742         else if (i2 == nind2) {
1743           stat1 += val1[i1]*val1[i1];
1744           i1++;
1745         }
1746         else if (ind1[i1] < ind2[i2]) {
1747           stat1 += val1[i1]*val1[i1];
1748           i1++;
1749         }
1750         else if (ind1[i1] > ind2[i2]) {
1751           stat2 += val2[i2]*val2[i2];
1752           i2++;
1753         }
1754         else {
1755           sim   += val1[i1]*val2[i2];
1756           stat1 += val1[i1]*val1[i1];
1757           stat2 += val2[i2]*val2[i2];
1758           i1++;
1759           i2++;
1760         }
1761       }
1762       if (simtype == GK_CSR_COS)
1763         sim = (stat1*stat2 > 0.0 ? sim/sqrt(stat1*stat2) : 0.0);
1764       else
1765         sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
1766       break;
1767 
1768     case GK_CSR_MIN:
1769       sim = stat1 = stat2 = 0.0;
1770       i1 = i2 = 0;
1771       while (i1<nind1 && i2<nind2) {
1772         if (i1 == nind1) {
1773           stat2 += val2[i2];
1774           i2++;
1775         }
1776         else if (i2 == nind2) {
1777           stat1 += val1[i1];
1778           i1++;
1779         }
1780         else if (ind1[i1] < ind2[i2]) {
1781           stat1 += val1[i1];
1782           i1++;
1783         }
1784         else if (ind1[i1] > ind2[i2]) {
1785           stat2 += val2[i2];
1786           i2++;
1787         }
1788         else {
1789           sim   += gk_min(val1[i1],val2[i2]);
1790           stat1 += val1[i1];
1791           stat2 += val2[i2];
1792           i1++;
1793           i2++;
1794         }
1795       }
1796       sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
1797 
1798       break;
1799 
1800     case GK_CSR_AMIN:
1801       sim = stat1 = stat2 = 0.0;
1802       i1 = i2 = 0;
1803       while (i1<nind1 && i2<nind2) {
1804         if (i1 == nind1) {
1805           stat2 += val2[i2];
1806           i2++;
1807         }
1808         else if (i2 == nind2) {
1809           stat1 += val1[i1];
1810           i1++;
1811         }
1812         else if (ind1[i1] < ind2[i2]) {
1813           stat1 += val1[i1];
1814           i1++;
1815         }
1816         else if (ind1[i1] > ind2[i2]) {
1817           stat2 += val2[i2];
1818           i2++;
1819         }
1820         else {
1821           sim   += gk_min(val1[i1],val2[i2]);
1822           stat1 += val1[i1];
1823           stat2 += val2[i2];
1824           i1++;
1825           i2++;
1826         }
1827       }
1828       sim = (stat1 > 0.0 ? sim/stat1 : 0.0);
1829 
1830       break;
1831 
1832     default:
1833       gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
1834       return -1;
1835   }
1836 
1837   return sim;
1838 
1839 }
1840 
1841 
1842 /*************************************************************************/
1843 /*! Finds the n most similar rows (neighbors) to the query using cosine
1844     similarity.
1845 
1846     \param mat the matrix itself
1847     \param nqterms is the number of columns in the query
1848     \param qind is the list of query columns
1849     \param qval is the list of correspodning query weights
1850     \param simtype is the type of similarity and is one of GK_CSR_COS,
1851            GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN
1852     \param nsim is the maximum number of requested most similar rows.
1853            If -1 is provided, then everything is returned unsorted.
1854     \param minsim is the minimum similarity of the requested most
1855            similar rows
1856     \param hits is the result set. This array should be at least
1857            of length nsim.
1858     \param i_marker is an array of size equal to the number of rows
1859            whose values are initialized to -1. If NULL is provided
1860            then this array is allocated and freed internally.
1861     \param i_cand is an array of size equal to the number of rows.
1862            If NULL is provided then this array is allocated and freed
1863            internally.
1864     \returns the number of identified most similar rows, which can be
1865              smaller than the requested number of nnbrs in those cases
1866              in which there are no sufficiently many neighbors.
1867 */
1868 /**************************************************************************/
gk_csr_GetSimilarRows(gk_csr_t * mat,int nqterms,int * qind,float * qval,int simtype,int nsim,float minsim,gk_fkv_t * hits,int * i_marker,gk_fkv_t * i_cand)1869 int gk_csr_GetSimilarRows(gk_csr_t *mat, int nqterms, int *qind,
1870         float *qval, int simtype, int nsim, float minsim, gk_fkv_t *hits,
1871         int *i_marker, gk_fkv_t *i_cand)
1872 {
1873   ssize_t i, ii, j, k;
1874   int nrows, ncols, ncand;
1875   ssize_t *colptr;
1876   int *colind, *marker;
1877   float *colval, *rnorms, mynorm, *rsums, mysum;
1878   gk_fkv_t *cand;
1879 
1880   if (nqterms == 0)
1881     return 0;
1882 
1883   nrows  = mat->nrows;
1884   ncols  = mat->ncols;
1885   colptr = mat->colptr;
1886   colind = mat->colind;
1887   colval = mat->colval;
1888 
1889   marker = (i_marker ? i_marker : gk_ismalloc(nrows, -1, "gk_csr_SimilarRows: marker"));
1890   cand   = (i_cand   ? i_cand   : gk_fkvmalloc(nrows, "gk_csr_SimilarRows: cand"));
1891 
1892   switch (simtype) {
1893     case GK_CSR_COS:
1894       for (ncand=0, ii=0; ii<nqterms; ii++) {
1895         i = qind[ii];
1896         if (i < ncols) {
1897           for (j=colptr[i]; j<colptr[i+1]; j++) {
1898             k = colind[j];
1899             if (marker[k] == -1) {
1900               cand[ncand].val = k;
1901               cand[ncand].key = 0;
1902               marker[k]       = ncand++;
1903             }
1904             cand[marker[k]].key += colval[j]*qval[ii];
1905           }
1906         }
1907       }
1908       break;
1909 
1910     case GK_CSR_JAC:
1911       for (ncand=0, ii=0; ii<nqterms; ii++) {
1912         i = qind[ii];
1913         if (i < ncols) {
1914           for (j=colptr[i]; j<colptr[i+1]; j++) {
1915             k = colind[j];
1916             if (marker[k] == -1) {
1917               cand[ncand].val = k;
1918               cand[ncand].key = 0;
1919               marker[k]       = ncand++;
1920             }
1921             cand[marker[k]].key += colval[j]*qval[ii];
1922           }
1923         }
1924       }
1925 
1926       rnorms = mat->rnorms;
1927       mynorm = gk_fdot(nqterms, qval, 1, qval, 1);
1928 
1929       for (i=0; i<ncand; i++)
1930         cand[i].key = cand[i].key/(rnorms[cand[i].val]+mynorm-cand[i].key);
1931       break;
1932 
1933     case GK_CSR_MIN:
1934       for (ncand=0, ii=0; ii<nqterms; ii++) {
1935         i = qind[ii];
1936         if (i < ncols) {
1937           for (j=colptr[i]; j<colptr[i+1]; j++) {
1938             k = colind[j];
1939             if (marker[k] == -1) {
1940               cand[ncand].val = k;
1941               cand[ncand].key = 0;
1942               marker[k]       = ncand++;
1943             }
1944             cand[marker[k]].key += gk_min(colval[j], qval[ii]);
1945           }
1946         }
1947       }
1948 
1949       rsums = mat->rsums;
1950       mysum = gk_fsum(nqterms, qval, 1);
1951 
1952       for (i=0; i<ncand; i++)
1953         cand[i].key = cand[i].key/(rsums[cand[i].val]+mysum-cand[i].key);
1954       break;
1955 
1956     /* Assymetric MIN  similarity */
1957     case GK_CSR_AMIN:
1958       for (ncand=0, ii=0; ii<nqterms; ii++) {
1959         i = qind[ii];
1960         if (i < ncols) {
1961           for (j=colptr[i]; j<colptr[i+1]; j++) {
1962             k = colind[j];
1963             if (marker[k] == -1) {
1964               cand[ncand].val = k;
1965               cand[ncand].key = 0;
1966               marker[k]       = ncand++;
1967             }
1968             cand[marker[k]].key += gk_min(colval[j], qval[ii]);
1969           }
1970         }
1971       }
1972 
1973       mysum = gk_fsum(nqterms, qval, 1);
1974 
1975       for (i=0; i<ncand; i++)
1976         cand[i].key = cand[i].key/mysum;
1977       break;
1978 
1979     default:
1980       gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
1981       return -1;
1982   }
1983 
1984   /* go and prune the hits that are bellow minsim */
1985   for (j=0, i=0; i<ncand; i++) {
1986     marker[cand[i].val] = -1;
1987     if (cand[i].key >= minsim)
1988       cand[j++] = cand[i];
1989   }
1990   ncand = j;
1991 
1992   if (nsim == -1 || nsim >= ncand) {
1993     nsim = ncand;
1994   }
1995   else {
1996     nsim = gk_min(nsim, ncand);
1997     gk_dfkvkselect(ncand, nsim, cand);
1998     gk_fkvsortd(nsim, cand);
1999   }
2000 
2001   gk_fkvcopy(nsim, cand, hits);
2002 
2003   if (i_marker == NULL)
2004     gk_free((void **)&marker, LTERM);
2005   if (i_cand == NULL)
2006     gk_free((void **)&cand, LTERM);
2007 
2008   return nsim;
2009 }
2010 
2011