1 %{
2 #include <string.h>
3 #include "deflex.h"
4 #ifdef DMALLOC
5 #include <dmalloc.h>
6 #endif
7 
8 char *token = NULL;  /* pointer to token */
9 char *s     = NULL;  /* to return WHOLE hyphenated-word */
10 
11 YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
12 
13 int lrlimit = -1;	/* for limiting read from filehandle ( -1 - unlimited read ) */
14 int bytestoread = 0;	/* for limiting read from filehandle */
15 
/*
 * Replacement for flex's default YY_INPUT that can stop after a fixed number
 * of bytes.
 *
 * lrlimit is the number of bytes that may still be read: a negative value
 * means "no limit", 0 means the budget is spent and end-of-input (YY_NULL)
 * is reported.  The budget is enforced on BOTH paths -- the interactive,
 * line-at-a-time path and the block fread() path -- and is charged with the
 * number of bytes actually obtained, so it can never drop below zero (which
 * would read as "unlimited").
 *
 * buf      - destination for the bytes read
 * result   - lvalue receiving the number of bytes stored (YY_NULL at end)
 * max_size - capacity of buf offered by the scanner
 */
#define YY_INPUT(buf,result,max_size) \
	if ( lrlimit == 0 ) { \
		result = YY_NULL; \
	} else { \
		bytestoread = (int) (max_size); \
		if ( lrlimit > 0 && lrlimit < bytestoread ) \
			bytestoread = lrlimit; \
		if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) { \
			int c = '*', n; \
			for ( n = 0; n < bytestoread && \
			             (c = getc( fts_yyin )) != EOF && c != '\n'; ++n ) \
				buf[n] = (char) c; \
			if ( c == '\n' ) \
				buf[n++] = (char) c; \
			if ( c == EOF && ferror( fts_yyin ) ) \
				YY_FATAL_ERROR( "input in flex scanner failed" ); \
			result = n; \
		} else if ( ((result = fread( buf, 1, bytestoread, fts_yyin )) == 0) \
		            && ferror( fts_yyin ) ) { \
			YY_FATAL_ERROR( "input in flex scanner failed" ); \
		} \
		if ( lrlimit > 0 ) \
			lrlimit -= (int) (result); \
	}
42 
43 
44 %}
45 
%option nodefault nounput 8bit

/* exclusive start condition used while splitting a hyphenated word */
%x DELIM
/* exclusive start conditions used while splitting a URL */
%x URL SERVER

/* exclusive start conditions used while skipping HTML markup */
%x INTAG QINTAG INCOMMENT INSCRIPT

/* character classes; bytes 0xa3, 0xb3 and 0xc0-0xff are the cyrillic koi8 chars */
CYRALNUM	[0-9\xa3\xb3\xc0-\xff]
CYRALPHA	[\xa3\xb3\xc0-\xff]
ALPHA		[a-zA-Z\xa3\xb3\xc0-\xff]
ALNUM		[0-9a-zA-Z\xa3\xb3\xc0-\xff]


/* dotted host name ending in an alphabetic label, and the path/query part of a URL */
HOSTNAME	([-_[:alnum:]]+\.)+[[:alpha:]]+
URI		[-_[:alnum:]/%,\.;=&?#]+
71 
72 %%
73 
"<"[Ss][Cc][Rr][Ii][Pp][Tt] {
	/* opening script tag: switch to the state that discards its content */
	BEGIN INSCRIPT;
}

<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
	/* closing script tag: report the whole element as a single blank */
	fts_yytext[0] = ' ';
	fts_yytext[1] = '\0';
	token = fts_yytext;
	BEGIN INITIAL;
	return SPACE;
}

"<!--" {
	/* start of an HTML comment */
	BEGIN INCOMMENT;
}

<INCOMMENT>"-->" {
	/* end of an HTML comment: report it as a single blank */
	fts_yytext[0] = ' ';
	fts_yytext[1] = '\0';
	token = fts_yytext;
	BEGIN INITIAL;
	return SPACE;
}


"<"[\![:alpha:]] {
	/* opening tag or "<!" declaration */
	BEGIN INTAG;
}

"</"[[:alpha:]] {
	/* closing tag */
	BEGIN INTAG;
}

<INTAG>"\"" {
	/* a double-quoted attribute value begins */
	BEGIN QINTAG;
}

<QINTAG>"\\\"" {
	/* backslash-escaped quote: stay inside the quoted value */
}

<QINTAG>"\"" {
	/* the quoted attribute value ends */
	BEGIN INTAG;
}

<INTAG>">" {
	/* end of tag: report it as a TAG token whose text is a single blank */
	fts_yytext[0] = ' ';
	fts_yytext[1] = '\0';
	token = fts_yytext;
	BEGIN INITIAL;
	return TAG;
}

<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n {
	/* everything else inside markup is discarded */
}
112 
\&(quot|amp|nbsp|lt|gt)\;	|
\&\#[0-9][0-9]?[0-9]?\;	{
	/* named entity, or numeric entity of one to three digits */
	token = fts_yytext;
	return HTMLENTITY;
}

[-_\.[:alnum:]]+@{HOSTNAME} {
	/* e-mail address */
	token = fts_yytext;
	return EMAIL;
}

[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ {
	/* floating point number with an exponent */
	token = fts_yytext;
	return SCIENTIFIC;
}

[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
	/* at least three dot-separated groups of digits */
	token = fts_yytext;
	return VERSIONNUMBER;
}

[+-]?[0-9]+\.[0-9]+ {
	/* decimal fraction, optionally signed */
	token = fts_yytext;
	return DECIMAL;
}

[+-][0-9]+ {
	/* integer with an explicit sign */
	token = fts_yytext;
	return SIGNEDINT;
}

<DELIM,INITIAL>[0-9]+ {
	/* bare run of digits; also recognized inside a hyphenated word */
	token = fts_yytext;
	return UNSIGNEDINT;
}
152 
http"://"	|
ftp"://"	{
	/* URL scheme prefix; both schemes are reported as HTTP */
	BEGIN URL;
	token = fts_yytext;
	return HTTP;
}

<URL,INITIAL>{HOSTNAME}[/:]{URI} {
	/*
	 * Full URL: return a private copy of the whole match, then push the
	 * text back (yyless) so the SERVER rules re-scan it as host and path.
	 * The copy must be taken before yyless() truncates the match.
	 */
	free( s );
	s = strdup( fts_yytext );
	if ( s == NULL )
		YY_FATAL_ERROR( "out of dynamic memory in scanner action" );
	BEGIN SERVER;
	yyless( 0 );
	token = s;
	return FURL;
}

<SERVER,URL,INITIAL>{HOSTNAME} {
	/* host name, either standalone or as the first part of a URL */
	token = fts_yytext;
	return HOST;
}

<SERVER>[/:]{URI} {
	/* path, port or query part following the host of a URL */
	token = fts_yytext;
	return URI;
}

[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
	/* file path containing at least one slash */
	token = fts_yytext;
	return FILEPATH;
}
188 
({CYRALPHA}+-)+{CYRALPHA}+ {
	/*
	 * Cyrillic composite word: return a private copy of the whole word,
	 * then push the text back so the DELIM rules re-scan it part by part.
	 * The copy must be taken before yyless() truncates the match.
	 */
	free( s );
	s = strdup( fts_yytext );
	if ( s == NULL )
		YY_FATAL_ERROR( "out of dynamic memory in scanner action" );
	BEGIN DELIM;
	yyless( 0 );
	token = s;
	return CYRHYPHENWORD;
}

([[:alpha:]]+-)+[[:alpha:]]+ {
	/* composite word of [:alpha:] parts; handled like the rule above */
	free( s );
	s = strdup( fts_yytext );
	if ( s == NULL )
		YY_FATAL_ERROR( "out of dynamic memory in scanner action" );
	BEGIN DELIM;
	yyless( 0 );
	token = s;
	return LATHYPHENWORD;
}

({ALNUM}+-)+{ALNUM}+ {
	/* composite word of mixed letters and digits; handled like the rules above */
	free( s );
	s = strdup( fts_yytext );
	if ( s == NULL )
		YY_FATAL_ERROR( "out of dynamic memory in scanner action" );
	BEGIN DELIM;
	yyless( 0 );
	token = s;
	return HYPHENWORD;
}

<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
	/* version number appearing as one part of a composite word */
	token = fts_yytext;
	return VERSIONNUMBER;
}

<DELIM>\+?[0-9]+\.[0-9]+ {
	/* decimal part; a leading '-' is left for the separator rule below */
	token = fts_yytext;
	return DECIMAL;
}

<DELIM>{CYRALPHA}+ {
	/* one cyrillic word in composite-word */
	token = fts_yytext;
	return CYRPARTHYPHENWORD;
}

<DELIM>[[:alpha:]]+ {
	/* one [:alpha:] word in composite-word */
	token = fts_yytext;
	return LATPARTHYPHENWORD;
}

<DELIM>{ALNUM}+ {
	/* one mixed letter/digit word in composite-word */
	token = fts_yytext;
	return PARTHYPHENWORD;
}

<DELIM>- {
	/* the hyphen between two parts is reported as SPACE */
	token = fts_yytext;
	return SPACE;
}

<DELIM,SERVER,URL>.|\n {
	/* anything else: return to the basic state and re-scan the character there */
	BEGIN INITIAL;
	yyless( 0 );
}
250 
{CYRALPHA}+ {
	/* normal word made of cyrillic letters only */
	token = fts_yytext;
	return CYRWORD;
}

[[:alpha:]]+ {
	/* normal word made of [:alpha:] letters only */
	token = fts_yytext;
	return LATWORD;
}

{ALNUM}+ {
	/* normal word mixing letters and digits */
	token = fts_yytext;
	return UWORD;
}

[ \r\n\t]+	|
.	{
	/* a whitespace run, or any single character no other rule claimed */
	token = fts_yytext;
	return SPACE;
}
275 
276 %%
277 
/*
 * End-of-input hook for the scanner.  Always returns 1, which in flex's
 * yywrap convention means there is no further input to continue with.
 */
int
fts_yywrap(void)
{
	return 1;
}
281 
282 /* clearing after parsing from string */
end_parse()283 void end_parse() {
284 	if (s) { free(s); s=NULL; }
285 	fts_yy_delete_buffer( buf );
286 	buf = NULL;
287 }
288 
289 /* start parse from string */
start_parse_str(char * str)290 void start_parse_str(char* str) {
291 	if (buf) end_parse();
292 	buf = fts_yy_scan_string( str );
293 	fts_yy_switch_to_buffer( buf );
294 	BEGIN INITIAL;
295 }
296 
297 /* start parse from filehandle */
start_parse_fh(FILE * fh,int limit)298 void start_parse_fh( FILE* fh, int limit ) {
299 	if (buf) end_parse();
300 	lrlimit = ( limit ) ? limit : -1;
301 	buf = fts_yy_create_buffer( fh, YY_BUF_SIZE );
302 	fts_yy_switch_to_buffer( buf );
303 	BEGIN INITIAL;
304 }
305 
306 
307