1 /*
2  *  Copyright (C) 1999-2004 Etymon Systems, Inc.
3  *
4  *  Authors:  Nassib Nassar
5  */
6 
7 #include "erc.h"
8 #include "fdef.h"
9 #include "util.h"
10 #include <stdio.h>
11 #include <ctype.h>
12 #include <string.h>
13 
14 
15 /* returns 0 if everything went OK */
dc_erc_init(ETYMON_AF_DC_INIT * dc_init)16 int dc_erc_init(ETYMON_AF_DC_INIT* dc_init) {
17 	return 0;
18 }
19 
20 
/*
 * Reads one character from docbuf via etymon_docbuf_next_char() and
 * keeps the caller's running position in step: *offset is incremented
 * by one before the read.
 *
 * Returns the character obtained from docbuf.
 */
unsigned char dc_erc_next_char(ETYMON_DOCBUF* docbuf,
			       etymon_af_off_t* offset) {
	unsigned char next;

	*offset += 1;
	next = etymon_docbuf_next_char(docbuf);

	return next;
}
26 
27 
28 /* returns 0 if everything went OK */
dc_erc_index(ETYMON_AF_DC_INDEX * dc_index)29 int dc_erc_index(ETYMON_AF_DC_INDEX* dc_index) {
30 	ETYMON_DOCBUF* docbuf = dc_index->docbuf;
31 	ETYMON_AF_INDEX_ADD_DOC add_doc;
32 	ETYMON_AF_INDEX_ADD_WORD add_word;
33 	unsigned char word[ETYMON_MAX_WORD_SIZE];
34 	Uint2 fields[ETYMON_MAX_FIELD_NEST];
35 	ETYMON_AF_DC_SPLIT* split_list = dc_index->split_list;
36 	ETYMON_AF_DC_SPLIT* split_p = split_list;
37 	unsigned char ch;
38 	unsigned char old_ch;
39 	int good;
40 	int x;
41 	etymon_af_off_t offset = 0;
42 	ETYMON_AF_FDEF_RESOLVE_FIELD resolve_field;
43 
44 	/* return if the document size is 0 */
45 	if (docbuf->data_len == 0) {
46 		return 0;
47 	}
48 
49 	/* initialize variables */
50 	add_doc.key = NULL;
51 	add_doc.filename = docbuf->fn;
52 	add_doc.parent = 0;
53 	add_doc.dclass_id = dc_index->dclass_id;
54 	add_doc.state = dc_index->state;
55 
56 	add_word.word = word;
57 	add_word.fields = fields;
58 	memset(fields, 0, ETYMON_MAX_FIELD_NEST * 2);
59 	add_word.state = dc_index->state;
60 
61 	resolve_field.word = word;
62 	resolve_field.state = dc_index->state;
63 
64 	add_doc.end = 0;
65 
66 	while (split_p) {
67 
68 		/* add document */
69 		add_doc.begin = add_doc.end;
70 		add_doc.end = split_p->end;
71 		add_word.doc_id = etymon_af_index_add_doc(&add_doc);
72 
73 		/* parse out the words */
74 		add_word.word_number = 1;
75 		fields[0] = 0;
76 		old_ch = '\n';
77 		while ( (docbuf->eof == 0) && (offset < add_doc.end) ) {
78 
79 			ch = '\0';
80 			good = 0;
81 
82 			/* loop past non alphanumeric chars */
83 			while ( (docbuf->eof == 0) && (offset < add_doc.end) && (isalnum(ch =
84 							       dc_erc_next_char(docbuf, &offset)) == 0) ) {
85 				old_ch = ch;
86 			}
87 
88 			if ( (docbuf->eof == 0) && (offset < add_doc.end) ) {
89 
90 				/* otherwise ch is the first char of the word */
91 				word[0] = ch;
92 
93 				/* add the rest of the chars to the word */
94 				x = 1;
95 				while (
96 					(x < (ETYMON_MAX_WORD_SIZE - 1)) && (docbuf->eof == 0) &&
97 					(offset < add_doc.end) &&
98 					( ((good = isalnum(ch =
99 							   dc_erc_next_char(docbuf, &offset))) != 0) ||
100 					  (good = (ch == '.')) ||
101 					  (good = (ch == '-')) )
102 					) {
103 					/* add ch to the word */
104 					word[x++] = ch;
105 				}
106 
107 				/* iterate past any remaining chars (if the word was truncated because it was too long to fit in word[] */
108 				if (good != 0) {
109 					/* the char was good, so we either ran out of room or hit eof/eod */
110 					while (
111 						(docbuf->eof == 0) &&
112 						(offset < add_doc.end) &&
113 						( (isalnum(ch =
114 							   dc_erc_next_char(docbuf, &offset)) != 0) ||
115 						  (ch == '.') ||
116 						  (ch == '-') )
117 						) {
118 					}
119 				}
120 
121 				/* truncate if last character is '.' */
122 				if (word[x - 1] == '.') {
123 					x--;
124 				}
125 
126 				/* terminate the word[] string */
127 				word[x] = '\0';
128 
129 				/* determine if the word is a field
130 				   name or an indexable word */
131 				if (old_ch == '\n') {
132 					/* field name */
133 					x = etymon_af_fdef_resolve_field(&resolve_field);
134 					fields[0] = x;
135 				} else {
136 					/* indexable word */
137 					etymon_tolower((char*)word);
138 					if (etymon_af_index_add_word(&add_word) == -1) {
139 						return -1;
140 					}
141 					add_word.word_number++;
142 				}
143 
144 				old_ch = ch;
145 			}
146 		}
147 
148 		/* next split */
149 		split_p = split_p->next;
150 
151 	}
152 
153 	return 0;
154 }
155