1 /*!
2 \file
3 \brief A simple frequent itemset discovery program to test GKlib's routines
4
5 \date 6/12/2008
6 \author George
7 \version \verbatim $Id: fis.c 11075 2011-11-11 22:31:52Z karypis $ \endverbatim
8 */
9
10 #include <GKlib.h>
11
12 /*************************************************************************/
13 /*! Data structures for the code */
14 /*************************************************************************/
/* Run parameters plus the mutable state shared between main() and the
   itemset callback. */
typedef struct {
  ssize_t minlen;      /* lower bound on itemset length (-minlen) */
  ssize_t maxlen;      /* upper bound on itemset length (-maxlen) */
  ssize_t minfreq;     /* lower bound on itemset frequency (-minfreq) */
  ssize_t maxfreq;     /* upper bound on itemset frequency (-maxfreq) */
  char *filename;      /* input file, taken from the positional argument */
  int silent;          /* non-zero: the callback does not print itemsets */
  ssize_t nitemsets;   /* number of itemsets reported to the callback so far */
  char *clabelfile;    /* column-label file (-clabels), NULL when not given */
  char **clabels;      /* label string for every matrix column */
} params_t;
24
25 /*************************************************************************/
26 /*! Constants */
27 /*************************************************************************/
/* Codes that identify each command-line option; they are stored in the
   long_options table and dispatched on in parse_cmdline(). */
enum {
  CMD_MINLEN     = 1,
  CMD_MAXLEN     = 2,
  CMD_MINFREQ    = 3,
  CMD_MAXFREQ    = 4,
  CMD_SILENT     = 5,
  CMD_CLABELFILE = 6,
  CMD_HELP       = 10
};
35
36
37 /*************************************************************************/
38 /*! Local variables */
39 /*************************************************************************/
40 static struct gk_option long_options[] = {
41 {"minlen", 1, 0, CMD_MINLEN},
42 {"maxlen", 1, 0, CMD_MAXLEN},
43 {"minfreq", 1, 0, CMD_MINFREQ},
44 {"maxfreq", 1, 0, CMD_MAXFREQ},
45 {"silent", 0, 0, CMD_SILENT},
46 {"clabels", 1, 0, CMD_CLABELFILE},
47 {"help", 0, 0, CMD_HELP},
48 {0, 0, 0, 0}
49 };
50
51
52 /*-------------------------------------------------------------------*/
53 /* Mini help */
54 /*-------------------------------------------------------------------*/
/* Full help text, one output line per row.  The table is read-only, so
   it is const-qualified.  The empty string in the last row is the
   sentinel that the printing loop in parse_cmdline() stops on. */
static const char helpstr[][100] = {
" ",
"Usage: fis [options] <mat-file>",
" ",
" Required parameters",
" mat-file",
" The name of the file storing the transactions. The file is in ",
" Cluto's .mat format.",
" ",
" Optional parameters",
" -minlen=int",
" Specifies the minimum length of the patterns. [default: 1]",
" ",
" -maxlen=int",
" Specifies the maximum length of the patterns. [default: none]",
" ",
" -minfreq=int",
" Specifies the minimum frequency of the patterns. [default: 10]",
" ",
" -maxfreq=int",
" Specifies the maximum frequency of the patterns. [default: none]",
" ",
" -silent",
" Does not print the discovered itemsets.",
" ",
" -clabels=filename",
" Specifies the name of the file that stores the column labels.",
" ",
" -help",
" Prints this message.",
""
};
87
/* Short usage text printed by parse_cmdline() when the positional
   argument count is wrong.  Read-only, hence const; the empty string in
   the last row is the sentinel the printing loop stops on. */
static const char shorthelpstr[][100] = {
" ",
" Usage: fis [options] <mat-file>",
" use 'fis -help' for a summary of the options.",
""
};
94
95
96
97 /*************************************************************************/
98 /*! Function prototypes */
99 /*************************************************************************/
100 void print_init_info(params_t *params, gk_csr_t *mat);
101 void print_final_info(params_t *params);
102 params_t *parse_cmdline(int argc, char *argv[]);
103 void print_an_itemset(void *stateptr, int nitems, int *itemind,
104 int ntrans, int *tranind);
105
106
107 /*************************************************************************/
108 /*! the entry point */
109 /**************************************************************************/
main(int argc,char * argv[])110 int main(int argc, char *argv[])
111 {
112 ssize_t i;
113 char line[8192];
114 FILE *fpin;
115 params_t *params;
116 gk_csr_t *mat;
117
118 params = parse_cmdline(argc, argv);
119 params->nitemsets = 0;
120
121 /* read the data */
122 mat = gk_csr_Read(params->filename, GK_CSR_FMT_CLUTO, 1, 1);
123 gk_csr_CreateIndex(mat, GK_CSR_COL);
124
125 /* read the column labels */
126 params->clabels = (char **)gk_malloc(mat->ncols*sizeof(char *), "main: clabels");
127 if (params->clabelfile == NULL) {
128 for (i=0; i<mat->ncols; i++) {
129 sprintf(line, "%zd", i);
130 params->clabels[i] = gk_strdup(line);
131 }
132 }
133 else {
134 fpin = gk_fopen(params->clabelfile, "r", "main: fpin");
135 for (i=0; i<mat->ncols; i++) {
136 if (fgets(line, 8192, fpin) == NULL)
137 errexit("Failed on fgets.\n");
138 params->clabels[i] = gk_strdup(gk_strtprune(line, " \n\t"));
139 }
140 gk_fclose(fpin);
141 }
142
143
144 print_init_info(params, mat);
145
146 gk_find_frequent_itemsets(mat->nrows, mat->rowptr, mat->rowind,
147 params->minfreq, params->maxfreq, params->minlen, params->maxlen,
148 &print_an_itemset, (void *)params);
149
150 printf("Total itemsets found: %zd\n", params->nitemsets);
151
152 print_final_info(params);
153 }
154
155
156
157 /*************************************************************************/
158 /*! This function prints run parameters */
159 /*************************************************************************/
/* Prints the banner, the input file name with the matrix dimensions, and
   the four length/frequency options in effect.
   params: supplies filename, minlen, maxlen, minfreq and maxfreq.
   mat:    supplies nrows, ncols and rowptr[nrows] (presumably the number
           of stored entries -- confirm against gk_csr_t). */
void print_init_info(params_t *params, gk_csr_t *mat)
{
  printf("*******************************************************************************\n");
  printf(" fis\n\n");
  printf("Matrix Information ---------------------------------------------------------\n");
  printf(" input file=%s, [%d, %d, %zd]\n",
      params->filename, mat->nrows, mat->ncols, mat->rowptr[mat->nrows]);

  printf("\n");
  printf("Options --------------------------------------------------------------------\n");
  /* label spelled "minfreq" to match the option name (it used to read "minfeq") */
  printf(" minlen=%zd, maxlen=%zd, minfreq=%zd, maxfreq=%zd\n",
      params->minlen, params->maxlen, params->minfreq, params->maxfreq);

  printf("\n");
  printf("Finding patterns... -----------------------------------------------------\n");
}
176
177
178 /*************************************************************************/
179 /*! This function prints final statistics */
180 /*************************************************************************/
/* Writes the closing report: the maximum and the current memory usage as
   returned by gk_GetMaxMemoryUsed() and gk_GetCurMemoryUsed(), followed
   by the closing banner.  The params argument is not read. */
void print_final_info(params_t *params)
{
  ssize_t nbytes;

  fputs("\n", stdout);
  fputs("Memory Usage Information -----------------------------------------------------\n", stdout);

  nbytes = (ssize_t) gk_GetMaxMemoryUsed();
  printf(" Maximum memory used: %10zd bytes\n", nbytes);

  nbytes = (ssize_t) gk_GetCurMemoryUsed();
  printf(" Current memory used: %10zd bytes\n", nbytes);

  fputs("********************************************************************************\n", stdout);
}
189
190
191 /*************************************************************************/
192 /*! This is the entry point of the command-line argument parser */
193 /*************************************************************************/
parse_cmdline(int argc,char * argv[])194 params_t *parse_cmdline(int argc, char *argv[])
195 {
196 int i;
197 int c, option_index;
198 params_t *params;
199
200 params = (params_t *)gk_malloc(sizeof(params_t), "parse_cmdline: params");
201
202 /* initialize the params data structure */
203 params->minlen = 1;
204 params->maxlen = -1;
205 params->minfreq = 10;
206 params->maxfreq = -1;
207 params->silent = 0;
208 params->filename = NULL;
209 params->clabelfile = NULL;
210
211
212 /* Parse the command line arguments */
213 while ((c = gk_getopt_long_only(argc, argv, "", long_options, &option_index)) != -1) {
214 switch (c) {
215 case CMD_MINLEN:
216 if (gk_optarg) params->minlen = atoi(gk_optarg);
217 break;
218 case CMD_MAXLEN:
219 if (gk_optarg) params->maxlen = atoi(gk_optarg);
220 break;
221 case CMD_MINFREQ:
222 if (gk_optarg) params->minfreq = atoi(gk_optarg);
223 break;
224 case CMD_MAXFREQ:
225 if (gk_optarg) params->maxfreq = atoi(gk_optarg);
226 break;
227
228 case CMD_SILENT:
229 params->silent = 1;
230 break;
231
232 case CMD_CLABELFILE:
233 if (gk_optarg) params->clabelfile = gk_strdup(gk_optarg);
234 break;
235
236 case CMD_HELP:
237 for (i=0; strlen(helpstr[i]) > 0; i++)
238 printf("%s\n", helpstr[i]);
239 exit(0);
240 break;
241 case '?':
242 default:
243 printf("Illegal command-line option(s)\nUse %s -help for a summary of the options.\n", argv[0]);
244 exit(0);
245 }
246 }
247
248 if (argc-gk_optind != 1) {
249 printf("Unrecognized parameters.");
250 for (i=0; strlen(shorthelpstr[i]) > 0; i++)
251 printf("%s\n", shorthelpstr[i]);
252 exit(0);
253 }
254
255 params->filename = gk_strdup(argv[gk_optind++]);
256
257 if (!gk_fexists(params->filename))
258 errexit("input file %s does not exist.\n", params->filename);
259
260 return params;
261 }
262
263
264
265 /*************************************************************************/
266 /*! This is the callback function for the itemset discovery routine */
267 /*************************************************************************/
print_an_itemset(void * stateptr,int nitems,int * itemids,int ntrans,int * transids)268 void print_an_itemset(void *stateptr, int nitems, int *itemids, int ntrans,
269 int *transids)
270 {
271 ssize_t i;
272 params_t *params;
273
274 params = (params_t *)stateptr;
275 params->nitemsets++;
276
277 if (!params->silent) {
278 printf("%4zd %4d %4d => ", params->nitemsets, nitems, ntrans);
279 for (i=0; i<nitems; i++)
280 printf(" %s", params->clabels[itemids[i]]);
281 printf("\n");
282 for (i=0; i<ntrans; i++)
283 printf(" %d\n", transids[i]);
284 printf("\n");
285 }
286 }
287