1 /*!
2  * \file
3  * \brief Frequent/Closed itemset discovery routines
4  *
5  * This file contains the code for finding frequent/closed itemests. These routines
6  * are implemented using a call-back mechanism to deal with the discovered itemsets.
7  *
8  * \date 6/13/2008
9  * \author George Karypis
10  * \version\verbatim $Id: itemsets.c 11075 2011-11-11 22:31:52Z karypis $ \endverbatim
11  */
12 
13 #include <GKlib.h>
14 
15 /*-------------------------------------------------------------*/
16 /*! Data structures for use within this module */
17 /*-------------------------------------------------------------*/
18 typedef struct {
19   int minfreq;  /* the minimum frequency of a pattern */
20   int maxfreq;  /* the maximum frequency of a pattern */
21   int minlen;   /* the minimum length of the requested pattern */
22   int maxlen;   /* the maximum length of the requested pattern */
23   int tnitems;  /* the initial range of the item space */
24 
25   /* the call-back function */
26   void (*callback)(void *stateptr, int nitems, int *itemids, int ntrans, int *transids);
27   void *stateptr;   /* the user-supplied pointer to pass to the callback */
28 
29   /* workspace variables */
30   int *rmarker;
31   gk_ikv_t *cand;
32 } isparams_t;
33 
34 
35 /*-------------------------------------------------------------*/
36 /*! Prototypes for this module */
37 /*-------------------------------------------------------------*/
38 void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat,
39          int preflen, int *prefix);
40 gk_csr_t *itemsets_project_matrix(isparams_t *param, gk_csr_t *mat, int cid);
41 
42 
43 
44 /*************************************************************************/
45 /*! The entry point of the frequent itemset discovery code */
46 /*************************************************************************/
gk_find_frequent_itemsets(int ntrans,ssize_t * tranptr,int * tranind,int minfreq,int maxfreq,int minlen,int maxlen,void (* process_itemset)(void * stateptr,int nitems,int * itemids,int ntrans,int * transids),void * stateptr)47 void gk_find_frequent_itemsets(int ntrans, ssize_t *tranptr, int *tranind,
48         int minfreq, int maxfreq, int minlen, int maxlen,
49         void (*process_itemset)(void *stateptr, int nitems, int *itemids,
50                                 int ntrans, int *transids),
51         void *stateptr)
52 {
53   ssize_t i;
54   gk_csr_t *mat, *pmat;
55   isparams_t params;
56   int *pattern;
57 
58   /* Create the matrix */
59   mat = gk_csr_Create();
60   mat->nrows  = ntrans;
61   mat->ncols  = tranind[gk_iargmax(tranptr[ntrans], tranind)]+1;
62   mat->rowptr = gk_zcopy(ntrans+1, tranptr, gk_zmalloc(ntrans+1, "gk_find_frequent_itemsets: mat.rowptr"));
63   mat->rowind = gk_icopy(tranptr[ntrans], tranind, gk_imalloc(tranptr[ntrans], "gk_find_frequent_itemsets: mat.rowind"));
64   mat->colids = gk_iincset(mat->ncols, 0, gk_imalloc(mat->ncols, "gk_find_frequent_itemsets: mat.colids"));
65 
66   /* Setup the parameters */
67   params.minfreq  = minfreq;
68   params.maxfreq  = (maxfreq == -1 ? mat->nrows : maxfreq);
69   params.minlen   = minlen;
70   params.maxlen   = (maxlen == -1 ? mat->ncols : maxlen);
71   params.tnitems  = mat->ncols;
72   params.callback = process_itemset;
73   params.stateptr = stateptr;
74   params.rmarker  = gk_ismalloc(mat->nrows, 0, "gk_find_frequent_itemsets: rmarker");
75   params.cand     = gk_ikvmalloc(mat->ncols, "gk_find_frequent_itemsets: cand");
76 
77   /* Perform the initial projection */
78   gk_csr_CreateIndex(mat, GK_CSR_COL);
79   pmat = itemsets_project_matrix(&params, mat, -1);
80   gk_csr_Free(&mat);
81 
82   pattern = gk_imalloc(pmat->ncols, "gk_find_frequent_itemsets: pattern");
83   itemsets_find_frequent_itemsets(&params, pmat, 0, pattern);
84 
85   gk_csr_Free(&pmat);
86   gk_free((void **)&pattern, &params.rmarker, &params.cand, LTERM);
87 
88 }
89 
90 
91 
92 /*************************************************************************/
93 /*! The recursive routine for DFS-based frequent pattern discovery */
94 /*************************************************************************/
itemsets_find_frequent_itemsets(isparams_t * params,gk_csr_t * mat,int preflen,int * prefix)95 void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat,
96          int preflen, int *prefix)
97 {
98   ssize_t i;
99   gk_csr_t *cmat;
100 
101   /* Project each frequent column */
102   for (i=0; i<mat->ncols; i++) {
103     prefix[preflen] = mat->colids[i];
104 
105     if (preflen+1 >= params->minlen)
106       (*params->callback)(params->stateptr, preflen+1, prefix,
107            mat->colptr[i+1]-mat->colptr[i], mat->colind+mat->colptr[i]);
108 
109     if (preflen+1 < params->maxlen) {
110       cmat = itemsets_project_matrix(params, mat, i);
111       itemsets_find_frequent_itemsets(params, cmat, preflen+1, prefix);
112       gk_csr_Free(&cmat);
113     }
114   }
115 
116 }
117 
118 
119 /******************************************************************************/
120 /*! This function projects a matrix w.r.t. to a particular column.
121     It performs the following steps:
122     - Determines the length of each column that is remaining
123     - Sorts the columns in increasing length
124     - Creates a column-based version of the matrix with the proper
125       column ordering and renamed rowids.
126  */
127 /*******************************************************************************/
itemsets_project_matrix(isparams_t * params,gk_csr_t * mat,int cid)128 gk_csr_t *itemsets_project_matrix(isparams_t *params, gk_csr_t *mat, int cid)
129 {
130   ssize_t i, j, k, ii, pnnz;
131   int nrows, ncols, pnrows, pncols;
132   ssize_t *colptr, *pcolptr;
133   int *colind, *colids, *pcolind, *pcolids, *rmarker;
134   gk_csr_t *pmat;
135   gk_ikv_t *cand;
136 
137   nrows  = mat->nrows;
138   ncols  = mat->ncols;
139   colptr = mat->colptr;
140   colind = mat->colind;
141   colids = mat->colids;
142 
143   rmarker = params->rmarker;
144   cand    = params->cand;
145 
146 
147   /* Allocate space for the projected matrix based on what you know thus far */
148   pmat = gk_csr_Create();
149   pmat->nrows  = pnrows = (cid == -1 ? nrows : colptr[cid+1]-colptr[cid]);
150 
151 
152   /* Mark the rows that will be kept and determine the prowids */
153   if (cid == -1) { /* Initial projection */
154     gk_iset(nrows, 1, rmarker);
155   }
156   else { /* The other projections */
157     for (i=colptr[cid]; i<colptr[cid+1]; i++)
158       rmarker[colind[i]] = 1;
159   }
160 
161 
162   /* Determine the length of each column that will be left in the projected matrix */
163   for (pncols=0, pnnz=0, i=cid+1; i<ncols; i++) {
164     for (k=0, j=colptr[i]; j<colptr[i+1]; j++) {
165       k += rmarker[colind[j]];
166     }
167     if (k >= params->minfreq && k <= params->maxfreq) {
168       cand[pncols].val   = i;
169       cand[pncols++].key = k;
170       pnnz += k;
171     }
172   }
173 
174   /* Sort the columns in increasing order */
175   gk_ikvsorti(pncols, cand);
176 
177 
178   /* Allocate space for the remaining fields of the projected matrix */
179   pmat->ncols  = pncols;
180   pmat->colids = pcolids = gk_imalloc(pncols, "itemsets_project_matrix: pcolids");
181   pmat->colptr = pcolptr = gk_zmalloc(pncols+1, "itemsets_project_matrix: pcolptr");
182   pmat->colind = pcolind = gk_imalloc(pnnz, "itemsets_project_matrix: pcolind");
183 
184 
185   /* Populate the projected matrix */
186   pcolptr[0] = 0;
187   for (pnnz=0, ii=0; ii<pncols; ii++) {
188     i = cand[ii].val;
189     for (j=colptr[i]; j<colptr[i+1]; j++) {
190       if (rmarker[colind[j]])
191         pcolind[pnnz++] = colind[j];
192     }
193 
194     pcolids[ii] = colids[i];
195     pcolptr[ii+1] = pnnz;
196   }
197 
198 
199   /* Reset the rmarker array */
200   if (cid == -1) { /* Initial projection */
201     gk_iset(nrows, 0, rmarker);
202   }
203   else { /* The other projections */
204     for (i=colptr[cid]; i<colptr[cid+1]; i++)
205       rmarker[colind[i]] = 0;
206   }
207 
208 
209   return pmat;
210 }
211