1 %{
2 #include <string.h>
3 #include "deflex.h"
4 #ifdef DMALLOC
5 #include <dmalloc.h>
6 #endif
7
/* Text of the token the scanner has just returned; set by every rule that returns. */
char *token = NULL;

/*
 * Heap copy of a whole compound match (hyphenated word or full URL).
 * The compound rules return it through `token` and then rescan the parts.
 */
char *s = NULL;

/* Flex buffer being scanned; reset to NULL by end_parse(). */
YY_BUFFER_STATE buf = NULL;

/*
 * Read limiting for file input, used by the YY_INPUT macro:
 *   lrlimit     - bytes that may still be read; -1 means unlimited
 *   bytestoread - size of the fread() request being issued
 */
int lrlimit = -1;
int bytestoread = 0;
15
/*
 * Replacement for flex's YY_INPUT that can cap how much is read from
 * fts_yyin.
 *
 * Interactive buffers are filled one character at a time, up to and
 * including the next newline; lrlimit is not consulted on that path.
 *
 * Otherwise lrlimit decides: 0 reports end of input (YY_NULL); a positive
 * value caps the fread() request and is reduced by the requested amount;
 * a negative value requests a full max_size chunk.
 */
#define YY_INPUT(buf,result,max_size) \
    if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) { \
        int c = '*'; \
        int n = 0; \
        while ( n < max_size && \
                (c = getc( fts_yyin )) != EOF && c != '\n' ) { \
            buf[n] = (char) c; \
            ++n; \
        } \
        if ( c == '\n' ) \
            buf[n++] = (char) c; \
        if ( c == EOF && ferror( fts_yyin ) ) \
            YY_FATAL_ERROR( "input in flex scanner failed" ); \
        result = n; \
    } else if ( lrlimit == 0 ) { \
        result = YY_NULL; \
    } else { \
        if ( lrlimit > 0 ) { \
            bytestoread = ( lrlimit > max_size ) ? max_size : lrlimit; \
            lrlimit -= bytestoread; \
        } else { \
            bytestoread = max_size; \
        } \
        if ( ((result = fread( buf, 1, bytestoread, fts_yyin )) == 0) \
             && ferror( fts_yyin ) ) \
            YY_FATAL_ERROR( "input in flex scanner failed" ); \
    }
42
43
44 %}
45
%option nodefault nounput 8bit

/*
 * Exclusive start conditions:
 *   DELIM                - rescanning the parts of a hyphenated word
 *   URL, SERVER          - rescanning the parts of a URL
 *   INTAG, QINTAG        - inside a markup tag / inside a double-quoted
 *                          value within a tag
 *   INCOMMENT, INSCRIPT  - inside a markup comment / inside a script element
 */
%x DELIM
%x URL SERVER
%x INTAG QINTAG INCOMMENT INSCRIPT

/* Character classes; the high-byte ranges are the Cyrillic letters of KOI8. */
CYRALNUM [0-9\xc0-\xdf\xe0-\xff\xa3\xb3]
CYRALPHA [\xc0-\xdf\xe0-\xff\xa3\xb3]
ALPHA [a-zA-Z\xc0-\xdf\xe0-\xff\xa3\xb3]
ALNUM [0-9a-zA-Z\xc0-\xdf\xe0-\xff\xa3\xb3]


/* One or more dot-terminated labels followed by a purely alphabetic label. */
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
/* Characters accepted in the part of a URL that follows the host. */
URI [-_[:alnum:]/%,\.;=&?#]+
71
72 %%
73
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { /* opening script tag, any letter case */
    BEGIN INSCRIPT;
}

<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" { /* closing script tag */
    BEGIN INITIAL;
    /* overwrite the match so the token text is a single blank */
    fts_yytext[0] = ' ';
    fts_yytext[1] = '\0';
    token = fts_yytext;
    return SPACE;
}
82
"<!--" { /* start of a markup comment */
    BEGIN INCOMMENT;
}

<INCOMMENT>"-->" { /* end of the comment */
    BEGIN INITIAL;
    /* overwrite the match so the token text is a single blank */
    fts_yytext[0] = ' ';
    fts_yytext[1] = '\0';
    token = fts_yytext;
    return SPACE;
}
91
92
"<"[\![:alpha:]] |
"</"[[:alpha:]] { /* "<" plus a letter or "!", or "</" plus a letter: a tag begins */
    BEGIN INTAG;
}

<INTAG>"\"" { /* a double-quoted value opens inside the tag */
    BEGIN QINTAG;
}

<QINTAG>"\\\"" { /* backslash-escaped quote: stay inside the quoted value */
}

<QINTAG>"\"" { /* the quoted value closes */
    BEGIN INTAG;
}

<INTAG>">" { /* end of tag: return TAG with a single blank as its text */
    BEGIN INITIAL;
    fts_yytext[0] = ' ';
    fts_yytext[1] = '\0';
    token = fts_yytext;
    return TAG;
}

<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { /* skip everything else in these states */
}
112
\&(quot|amp|nbsp|lt|gt)\; |
\&\#[0-9][0-9]?[0-9]?\; { /* one of five named entities, or a 1-3 digit numeric reference */
    token = fts_yytext;
    return HTMLENTITY;
}
122
[-_\.[:alnum:]]+@{HOSTNAME} { /* e-mail address: local part, "@", host name */
    token = fts_yytext;
    return EMAIL;
}

[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ { /* number with an e/E/d/D exponent */
    token = fts_yytext;
    return SCIENTIFIC;
}

[0-9]+\.[0-9]+\.[0-9\.]*[0-9] { /* digits.digits. then digits or dots ending in a digit, e.g. 1.2.3 */
    token = fts_yytext;
    return VERSIONNUMBER;
}

[+-]?[0-9]+\.[0-9]+ { /* optionally signed decimal fraction */
    token = fts_yytext;
    return DECIMAL;
}

[+-][0-9]+ { /* integer with an explicit sign */
    token = fts_yytext;
    return SIGNEDINT;
}

<DELIM,INITIAL>[0-9]+ { /* bare digit run; also active while in DELIM */
    token = fts_yytext;
    return UNSIGNEDINT;
}
152
http"://" |
ftp"://" { /* scheme prefix; both schemes are reported as HTTP */
    BEGIN URL;
    token = fts_yytext;
    return HTTP;
}

<URL,INITIAL>{HOSTNAME}[/:]{URI} { /* host immediately followed by "/" or ":" and a URI part */
    BEGIN SERVER;
    if (s) {
        free(s);
        s = NULL;
    }
    /* NOTE(review): the strdup() result is not checked, so token may be NULL */
    s = strdup( fts_yytext );
    /* return the whole match to the input so HOST and URI are scanned next */
    yyless( 0 );
    token = s;
    return FURL;
}

<SERVER,URL,INITIAL>{HOSTNAME} { /* host name on its own */
    token = fts_yytext;
    return HOST;
}

<SERVER>[/:]{URI} { /* the part after the host; matched only in SERVER */
    token = fts_yytext;
    return URI;
}

[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ { /* two name runs joined by a slash */
    token = fts_yytext;
    return FILEPATH;
}
188
({CYRALPHA}+-)+{CYRALPHA}+ { /* hyphenated word made only of CYRALPHA characters */
    BEGIN DELIM;
    if (s) {
        free(s);
        s = NULL;
    }
    s = strdup( fts_yytext );
    /* return the match to the input; DELIM rules then yield its parts */
    yyless( 0 );
    token = s;
    return CYRHYPHENWORD;
}

([[:alpha:]]+-)+[[:alpha:]]+ { /* hyphenated word made only of [:alpha:] characters */
    BEGIN DELIM;
    if (s) {
        free(s);
        s = NULL;
    }
    s = strdup( fts_yytext );
    /* return the match to the input; DELIM rules then yield its parts */
    yyless( 0 );
    token = s;
    return LATHYPHENWORD;
}

({ALNUM}+-)+{ALNUM}+ { /* hyphenated word of mixed ALNUM characters */
    BEGIN DELIM;
    if (s) {
        free(s);
        s = NULL;
    }
    s = strdup( fts_yytext );
    /* return the match to the input; DELIM rules then yield its parts */
    yyless( 0 );
    token = s;
    return HYPHENWORD;
}
215
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] { /* version-like number inside a compound */
    token = fts_yytext;
    return VERSIONNUMBER;
}

<DELIM>\+?[0-9]+\.[0-9]+ { /* decimal inside a compound; only "+" is accepted as a sign here */
    token = fts_yytext;
    return DECIMAL;
}

<DELIM>{CYRALPHA}+ { /* CYRALPHA-only part of a compound */
    token = fts_yytext;
    return CYRPARTHYPHENWORD;
}

<DELIM>[[:alpha:]]+ { /* [:alpha:]-only part of a compound */
    token = fts_yytext;
    return LATPARTHYPHENWORD;
}

<DELIM>{ALNUM}+ { /* mixed ALNUM part of a compound */
    token = fts_yytext;
    return PARTHYPHENWORD;
}

<DELIM>- { /* the hyphen between parts is reported as SPACE */
    token = fts_yytext;
    return SPACE;
}

<DELIM,SERVER,URL>.|\n { /* no sub-state rule applies: go back to INITIAL and rescan this character */
    BEGIN INITIAL;
    yyless( 0 );
}
250
{CYRALPHA}+ { /* word made only of CYRALPHA characters */
    token = fts_yytext;
    return CYRWORD;
}

[[:alpha:]]+ { /* word made only of [:alpha:] characters */
    token = fts_yytext;
    return LATWORD;
}

{ALNUM}+ { /* word of mixed ALNUM characters */
    token = fts_yytext;
    return UWORD;
}

[ \r\n\t]+ |
. { /* a whitespace run, or any other single character, is reported as SPACE */
    token = fts_yytext;
    return SPACE;
}
275
276 %%
277
/*
 * End-of-input hook for the scanner. Always returns 1; in flex a non-zero
 * result from the wrap function means no further input follows.
 */
int fts_yywrap(void)
{
    return 1;
}
281
/*
 * Clean up after a parse: free the saved copy of the last compound match
 * (s) and delete the scanner buffer (buf). Both are reset to NULL.
 *
 * Declared with an explicit (void) parameter list so that the definition
 * provides a real prototype.
 */
void end_parse(void)
{
    if (s) {
        free(s);
        s = NULL;
    }
    fts_yy_delete_buffer( buf );
    buf = NULL;
}
288
/*
 * Begin scanning the NUL-terminated string `str`.
 *
 * If a buffer is still active it is released through end_parse() first.
 * The scanner then switches to a buffer created by fts_yy_scan_string()
 * and is put back into the INITIAL state.
 */
void start_parse_str(char *str)
{
    if (buf != NULL) {
        end_parse();
    }

    buf = fts_yy_scan_string( str );
    fts_yy_switch_to_buffer( buf );
    BEGIN INITIAL;
}
296
/*
 * Begin scanning from the open file handle `fh`.
 *
 * `limit` is stored in lrlimit for the YY_INPUT macro; a limit of 0 is
 * stored as -1, which lrlimit's declaration describes as unlimited.
 * If a buffer is still active it is released through end_parse() first;
 * the scanner is then put back into the INITIAL state.
 */
void start_parse_fh(FILE *fh, int limit)
{
    if (buf != NULL) {
        end_parse();
    }

    lrlimit = ( limit == 0 ) ? -1 : limit;

    buf = fts_yy_create_buffer( fh, YY_BUF_SIZE );
    fts_yy_switch_to_buffer( buf );
    BEGIN INITIAL;
}
305
306
307