1 /* 2 * Copyright (C) 1999-2004 Etymon Systems, Inc. 3 * 4 * Authors: Nassib Nassar 5 */ 6 7 #include "erc.h" 8 #include "fdef.h" 9 #include "util.h" 10 #include <stdio.h> 11 #include <ctype.h> 12 #include <string.h> 13 14 15 /* returns 0 if everything went OK */ dc_erc_init(ETYMON_AF_DC_INIT * dc_init)16int dc_erc_init(ETYMON_AF_DC_INIT* dc_init) { 17 return 0; 18 } 19 20 dc_erc_next_char(ETYMON_DOCBUF * docbuf,etymon_af_off_t * offset)21unsigned char dc_erc_next_char(ETYMON_DOCBUF* docbuf, 22 etymon_af_off_t* offset) { 23 (*offset)++; 24 return etymon_docbuf_next_char(docbuf); 25 } 26 27 28 /* returns 0 if everything went OK */ dc_erc_index(ETYMON_AF_DC_INDEX * dc_index)29int dc_erc_index(ETYMON_AF_DC_INDEX* dc_index) { 30 ETYMON_DOCBUF* docbuf = dc_index->docbuf; 31 ETYMON_AF_INDEX_ADD_DOC add_doc; 32 ETYMON_AF_INDEX_ADD_WORD add_word; 33 unsigned char word[ETYMON_MAX_WORD_SIZE]; 34 Uint2 fields[ETYMON_MAX_FIELD_NEST]; 35 ETYMON_AF_DC_SPLIT* split_list = dc_index->split_list; 36 ETYMON_AF_DC_SPLIT* split_p = split_list; 37 unsigned char ch; 38 unsigned char old_ch; 39 int good; 40 int x; 41 etymon_af_off_t offset = 0; 42 ETYMON_AF_FDEF_RESOLVE_FIELD resolve_field; 43 44 /* return if the document size is 0 */ 45 if (docbuf->data_len == 0) { 46 return 0; 47 } 48 49 /* initialize variables */ 50 add_doc.key = NULL; 51 add_doc.filename = docbuf->fn; 52 add_doc.parent = 0; 53 add_doc.dclass_id = dc_index->dclass_id; 54 add_doc.state = dc_index->state; 55 56 add_word.word = word; 57 add_word.fields = fields; 58 memset(fields, 0, ETYMON_MAX_FIELD_NEST * 2); 59 add_word.state = dc_index->state; 60 61 resolve_field.word = word; 62 resolve_field.state = dc_index->state; 63 64 add_doc.end = 0; 65 66 while (split_p) { 67 68 /* add document */ 69 add_doc.begin = add_doc.end; 70 add_doc.end = split_p->end; 71 add_word.doc_id = etymon_af_index_add_doc(&add_doc); 72 73 /* parse out the words */ 74 add_word.word_number = 1; 75 fields[0] = 0; 76 old_ch = '\n'; 77 while ( (docbuf->eof == 0) && (offset < add_doc.end) ) { 78 79 ch = '\0'; 80 good = 0; 81 82 /* loop past non alphanumeric chars */ 83 while ( (docbuf->eof == 0) && (offset < add_doc.end) && (isalnum(ch = 84 dc_erc_next_char(docbuf, &offset)) == 0) ) { 85 old_ch = ch; 86 } 87 88 if ( (docbuf->eof == 0) && (offset < add_doc.end) ) { 89 90 /* otherwise ch is the first char of the word */ 91 word[0] = ch; 92 93 /* add the rest of the chars to the word */ 94 x = 1; 95 while ( 96 (x < (ETYMON_MAX_WORD_SIZE - 1)) && (docbuf->eof == 0) && 97 (offset < add_doc.end) && 98 ( ((good = isalnum(ch = 99 dc_erc_next_char(docbuf, &offset))) != 0) || 100 (good = (ch == '.')) || 101 (good = (ch == '-')) ) 102 ) { 103 /* add ch to the word */ 104 word[x++] = ch; 105 } 106 107 /* iterate past any remaining chars (if the word was truncated because it was too long to fit in word[] */ 108 if (good != 0) { 109 /* the char was good, so we either ran out of room or hit eof/eod */ 110 while ( 111 (docbuf->eof == 0) && 112 (offset < add_doc.end) && 113 ( (isalnum(ch = 114 dc_erc_next_char(docbuf, &offset)) != 0) || 115 (ch == '.') || 116 (ch == '-') ) 117 ) { 118 } 119 } 120 121 /* truncate if last character is '.' */ 122 if (word[x - 1] == '.') { 123 x--; 124 } 125 126 /* terminate the word[] string */ 127 word[x] = '\0'; 128 129 /* determine if the word is a field 130 name or an indexable word */ 131 if (old_ch == '\n') { 132 /* field name */ 133 x = etymon_af_fdef_resolve_field(&resolve_field); 134 fields[0] = x; 135 } else { 136 /* indexable word */ 137 etymon_tolower((char*)word); 138 if (etymon_af_index_add_word(&add_word) == -1) { 139 return -1; 140 } 141 add_word.word_number++; 142 } 143 144 old_ch = ch; 145 } 146 } 147 148 /* next split */ 149 split_p = split_p->next; 150 151 } 152 153 return 0; 154 } 155