1 /*!
2 \file
3 \brief A simple frequent itemset discovery program to test GKlib's routines
4 
5 \date 6/12/2008
6 \author George
7 \version \verbatim $Id: fis.c 11075 2011-11-11 22:31:52Z karypis $ \endverbatim
8 */
9 
#include <GKlib.h>

/* Standard headers used directly by this file (strtol, errno and the INT_*
   limits in the option parser). Kept after GKlib.h in case that header
   configures feature-test macros that must precede system headers. */
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
11 
/*************************************************************************/
/*! Run parameters and running state shared by the command-line parser,
    the driver and the itemset callback */
/*************************************************************************/
typedef struct {
  ssize_t minlen;       /*!< minimum pattern length (-minlen) */
  ssize_t maxlen;       /*!< maximum pattern length (-maxlen) */
  ssize_t minfreq;      /*!< minimum pattern frequency (-minfreq) */
  ssize_t maxfreq;      /*!< maximum pattern frequency (-maxfreq) */
  char *filename;       /*!< name of the input transaction file */
  int silent;           /*!< non-zero: the discovered itemsets are not printed */
  ssize_t nitemsets;    /*!< number of itemsets reported so far */
  char *clabelfile;     /*!< name of the column-label file, NULL if not given */
  char **clabels;       /*!< label string of each column */
} params_t;
24 
/*************************************************************************/
/*! Constants: the value stored with each entry of the long-option table,
    used to dispatch on the option that was recognized. An enum is used
    instead of macros so the names are visible to the debugger and cannot
    be affected by macro expansion. */
/*************************************************************************/
enum {
  CMD_MINLEN     = 1,    /* -minlen=int */
  CMD_MAXLEN     = 2,    /* -maxlen=int */
  CMD_MINFREQ    = 3,    /* -minfreq=int */
  CMD_MAXFREQ    = 4,    /* -maxfreq=int */
  CMD_SILENT     = 5,    /* -silent */
  CMD_CLABELFILE = 6,    /* -clabels=filename */
  CMD_HELP       = 10    /* -help */
};
35 
36 
37 /*************************************************************************/
38 /*! Local variables */
39 /*************************************************************************/
40 static struct gk_option long_options[] = {
41   {"minlen",        1,      0,      CMD_MINLEN},
42   {"maxlen",        1,      0,      CMD_MAXLEN},
43   {"minfreq",       1,      0,      CMD_MINFREQ},
44   {"maxfreq",       1,      0,      CMD_MAXFREQ},
45   {"silent",        0,      0,      CMD_SILENT},
46   {"clabels",       1,      0,      CMD_CLABELFILE},
47   {"help",          0,      0,      CMD_HELP},
48   {0,               0,      0,      0}
49 };
50 
51 
/*-------------------------------------------------------------------*/
/* Mini help                                                          */
/*                                                                    */
/* Full help text, one output line per row. The table is terminated   */
/* by an empty string, which is what the printing loop looks for.     */
/* It is never modified, hence const.                                 */
/*-------------------------------------------------------------------*/
static const char helpstr[][100] = {
" ",
"Usage: fis [options] <mat-file>",
" ",
" Required parameters",
"  mat-file",
"     The name of the file storing the transactions. The file is in ",
"     Cluto's .mat format.",
" ",
" Optional parameters",
"  -minlen=int",
"     Specifies the minimum length of the patterns. [default: 1]",
" ",
"  -maxlen=int",
"     Specifies the maximum length of the patterns. [default: none]",
" ",
"  -minfreq=int",
"     Specifies the minimum frequency of the patterns. [default: 10]",
" ",
"  -maxfreq=int",
"     Specifies the maximum frequency of the patterns. [default: none]",
" ",
"  -silent",
"     Does not print the discovered itemsets.",
" ",
"  -clabels=filename",
"     Specifies the name of the file that stores the column labels.",
" ",
"  -help",
"     Prints this message.",
""
};
87 
/* Short usage text printed when the positional arguments are wrong. One
   output line per row, terminated by an empty string. Never modified,
   hence const. */
static const char shorthelpstr[][100] = {
" ",
"   Usage: fis [options] <mat-file>",
"          use 'fis -help' for a summary of the options.",
""
};
94 
95 
96 
97 /*************************************************************************/
98 /*! Function prototypes */
99 /*************************************************************************/
100 void print_init_info(params_t *params, gk_csr_t *mat);
101 void print_final_info(params_t *params);
102 params_t *parse_cmdline(int argc, char *argv[]);
103 void print_an_itemset(void *stateptr, int nitems, int *itemind,
104                       int ntrans, int *tranind);
105 
106 
107 /*************************************************************************/
108 /*! the entry point */
109 /**************************************************************************/
main(int argc,char * argv[])110 int main(int argc, char *argv[])
111 {
112   ssize_t i;
113   char line[8192];
114   FILE *fpin;
115   params_t *params;
116   gk_csr_t *mat;
117 
118   params = parse_cmdline(argc, argv);
119   params->nitemsets = 0;
120 
121   /* read the data */
122   mat = gk_csr_Read(params->filename, GK_CSR_FMT_CLUTO, 1, 1);
123   gk_csr_CreateIndex(mat, GK_CSR_COL);
124 
125   /* read the column labels */
126   params->clabels = (char **)gk_malloc(mat->ncols*sizeof(char *), "main: clabels");
127   if (params->clabelfile == NULL) {
128     for (i=0; i<mat->ncols; i++) {
129       sprintf(line, "%zd", i);
130       params->clabels[i] = gk_strdup(line);
131     }
132   }
133   else {
134     fpin = gk_fopen(params->clabelfile, "r", "main: fpin");
135     for (i=0; i<mat->ncols; i++) {
136       if (fgets(line, 8192, fpin) == NULL)
137         errexit("Failed on fgets.\n");
138       params->clabels[i] = gk_strdup(gk_strtprune(line, " \n\t"));
139     }
140     gk_fclose(fpin);
141   }
142 
143 
144   print_init_info(params, mat);
145 
146   gk_find_frequent_itemsets(mat->nrows, mat->rowptr, mat->rowind,
147       params->minfreq, params->maxfreq, params->minlen, params->maxlen,
148       &print_an_itemset, (void *)params);
149 
150   printf("Total itemsets found: %zd\n", params->nitemsets);
151 
152   print_final_info(params);
153 }
154 
155 
156 
157 /*************************************************************************/
158 /*! This function prints run parameters */
159 /*************************************************************************/
print_init_info(params_t * params,gk_csr_t * mat)160 void print_init_info(params_t *params, gk_csr_t *mat)
161 {
162   printf("*******************************************************************************\n");
163   printf(" fis\n\n");
164   printf("Matrix Information ---------------------------------------------------------\n");
165   printf(" input file=%s, [%d, %d, %zd]\n",
166       params->filename, mat->nrows, mat->ncols, mat->rowptr[mat->nrows]);
167 
168   printf("\n");
169   printf("Options --------------------------------------------------------------------\n");
170   printf(" minlen=%zd, maxlen=%zd, minfeq=%zd, maxfreq=%zd\n",
171       params->minlen, params->maxlen, params->minfreq, params->maxfreq);
172 
173   printf("\n");
174   printf("Finding patterns... -----------------------------------------------------\n");
175 }
176 
177 
178 /*************************************************************************/
179 /*! This function prints final statistics */
180 /*************************************************************************/
print_final_info(params_t * params)181 void print_final_info(params_t *params)
182 {
183   printf("\n");
184   printf("Memory Usage Information -----------------------------------------------------\n");
185   printf("   Maximum memory used:              %10zd bytes\n", (ssize_t) gk_GetMaxMemoryUsed());
186   printf("   Current memory used:              %10zd bytes\n", (ssize_t) gk_GetCurMemoryUsed());
187   printf("********************************************************************************\n");
188 }
189 
190 
191 /*************************************************************************/
192 /*! This is the entry point of the command-line argument parser */
193 /*************************************************************************/
parse_cmdline(int argc,char * argv[])194 params_t *parse_cmdline(int argc, char *argv[])
195 {
196   int i;
197   int c, option_index;
198   params_t *params;
199 
200   params = (params_t *)gk_malloc(sizeof(params_t), "parse_cmdline: params");
201 
202   /* initialize the params data structure */
203   params->minlen     = 1;
204   params->maxlen     = -1;
205   params->minfreq    = 10;
206   params->maxfreq    = -1;
207   params->silent     = 0;
208   params->filename   = NULL;
209   params->clabelfile = NULL;
210 
211 
212   /* Parse the command line arguments  */
213   while ((c = gk_getopt_long_only(argc, argv, "", long_options, &option_index)) != -1) {
214     switch (c) {
215       case CMD_MINLEN:
216         if (gk_optarg) params->minlen = atoi(gk_optarg);
217         break;
218       case CMD_MAXLEN:
219         if (gk_optarg) params->maxlen = atoi(gk_optarg);
220         break;
221       case CMD_MINFREQ:
222         if (gk_optarg) params->minfreq = atoi(gk_optarg);
223         break;
224       case CMD_MAXFREQ:
225         if (gk_optarg) params->maxfreq = atoi(gk_optarg);
226         break;
227 
228       case CMD_SILENT:
229         params->silent = 1;
230         break;
231 
232       case CMD_CLABELFILE:
233         if (gk_optarg) params->clabelfile = gk_strdup(gk_optarg);
234         break;
235 
236       case CMD_HELP:
237         for (i=0; strlen(helpstr[i]) > 0; i++)
238           printf("%s\n", helpstr[i]);
239         exit(0);
240         break;
241       case '?':
242       default:
243         printf("Illegal command-line option(s)\nUse %s -help for a summary of the options.\n", argv[0]);
244         exit(0);
245     }
246   }
247 
248   if (argc-gk_optind != 1) {
249     printf("Unrecognized parameters.");
250     for (i=0; strlen(shorthelpstr[i]) > 0; i++)
251       printf("%s\n", shorthelpstr[i]);
252     exit(0);
253   }
254 
255   params->filename = gk_strdup(argv[gk_optind++]);
256 
257   if (!gk_fexists(params->filename))
258     errexit("input file %s does not exist.\n", params->filename);
259 
260   return params;
261 }
262 
263 
264 
265 /*************************************************************************/
266 /*! This is the callback function for the itemset discovery routine */
267 /*************************************************************************/
print_an_itemset(void * stateptr,int nitems,int * itemids,int ntrans,int * transids)268 void print_an_itemset(void *stateptr, int nitems, int *itemids, int ntrans,
269          int *transids)
270 {
271   ssize_t i;
272   params_t *params;
273 
274   params = (params_t *)stateptr;
275   params->nitemsets++;
276 
277   if (!params->silent) {
278     printf("%4zd %4d %4d => ", params->nitemsets, nitems, ntrans);
279     for (i=0; i<nitems; i++)
280       printf(" %s", params->clabels[itemids[i]]);
281     printf("\n");
282     for (i=0; i<ntrans; i++)
283       printf(" %d\n", transids[i]);
284     printf("\n");
285   }
286 }
287