1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1996-2011 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                                                                      *
19 ***********************************************************************/
20 #pragma prototyped
21 /*
22  * Glenn Fowler
23  * AT&T Research
24  */
25 
26 static const char usage[] =
27 "[-?\n@(#)$Id: bb2tok (AT&T Research) 2007-12-19 $\n]"
28 USAGE_LICENSE
29 "[+NAME?bb2tok - convert bb html to tokens]"
30 "[+DESCRIPTION?\bbb2tok\b extracts tokens from input \bhtml\b \afile\as. "
31     "If \afile\a is not specified then the standard input is read. The "
32     "\bhtml\b parse is rudimentary; don't use \bbb2tok\b to detect valid "
33     "\bhtml\b files.]"
34 
35 "\n"
36 "\n[ file ... ]\n"
37 "\n"
38 
39 "[+SEE ALSO?\bhtml2db\b(1), \bhtml2rtf\b(1)]"
40 ;
41 
42 #include <ast.h>
43 #include <ctype.h>
44 #include <error.h>
45 
/*
 * Lexical codes for the recognized blocks. Each value is also the index
 * of the matching row in header[] (parse() uses header[LINK] and
 * header[LINE] directly), so the two must be kept in the same order.
 */
enum
{
	LINK	= 0,
	NAME	= 1,
	HEADER	= 2,
	BODY	= 3,
	QUOTE	= 4,
	CODE	= 5,
	LABEL	= 6,
	LINE	= 7
};
54 
/*
 * One recognized block type.
 *
 * The string members are only ever read (compared against a class
 * attribute value, or printed), and the table that uses this type is
 * initialized from string literals, so they are const-qualified.
 */
typedef struct Header_s
{
	const char*	in;		/* class attribute value matched by parse()	*/
	const char*	out;		/* tag name written to the output		*/
	int		lex;		/* lexical code (LINK ... LINE)			*/
	int		unary;		/* non-zero for the "link/" and "line/" rows	*/
} Header_t;
62 
63 static const	Header_t	header[] =
64 {
65 	"a",		"link/",	LINK,		1,
66 	"name",		"name",		NAME,		0,
67 	"postdetails",	"header",	HEADER,		0,
68 	"postbody",	"body",		BODY,		0,
69 	"quote",	"quote",	QUOTE,		0,
70 	"code",		"code",		CODE,		0,
71 	"genmed",	"label",	LABEL,		0,
72 	"line",		"line/",	LINE,		1,
73 };
74 
75 typedef struct State_s
76 {
77 	Header_t*	prev;
78 	int		push;
79 	int		keep;
80 	int		last;
81 	unsigned char*	lex;
82 } State_t;
83 
84 static void
token(State_t * state,Sfio_t * op,const char * text,const Header_t * head,int push)85 token(State_t* state, Sfio_t* op, const char* text, const Header_t* head, int push)
86 {
87 	if (!head)
88 	{
89 		if (state->keep)
90 		{
91 			if (*state->lex == LABEL && (streq(text, ":") || streq(text, "Code") || streq(text, "wrote")))
92 				return;
93 			if (state->prev)
94 			{
95 				sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
96 				state->prev = 0;
97 			}
98 			sfputr(op, text, *state->lex == HEADER ? ' ' : '\n');
99 		}
100 	}
101 	else if (push)
102 	{
103 		if (state->prev)
104 		{
105 			if (head->lex == LINK && state->prev->lex == NAME && state->push)
106 				return;
107 			if (head->lex == LINE && state->prev->lex == HEADER && !state->push)
108 				return;
109 			if (head->lex == HEADER && push && state->prev->lex == HEADER && !state->push)
110 			{
111 				state->prev = 0;
112 				return;
113 			}
114 			if (state->keep && (state->prev->lex != head->lex || !head->unary && state->push))
115 				sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
116 			if (head->lex == LINE && state->prev->lex == BODY && !state->push)
117 				state->keep = 0;
118 		}
119 		switch (head->lex)
120 		{
121 		case CODE:
122 			state->prev = 0;
123 			sfprintf(op, "<%s>", head->out);
124 			return;
125 		case NAME:
126 			state->keep = 1;
127 			break;
128 		}
129 		state->prev = (Header_t*)head;
130 		state->push = push;
131 	}
132 	else
133 	{
134 		if (state->keep && state->prev)
135 		{
136 			if (state->prev->lex == head->lex && state->push)
137 			{
138 				state->prev = 0;
139 				return;
140 			}
141 			sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
142 		}
143 		state->prev = (Header_t*)head;
144 		state->push = push;
145 	}
146 }
147 
/*
 * TOKEN: if any text has been collected in tok (t points one past its
 * last byte), terminate it, pass it to token() as a text token and reset
 * t to tok. t must be a modifiable lvalue.
 * PUSH/POP: report the open/close of block h to token().
 *
 * Arguments are parenthesized so that expression arguments expand safely.
 */
#define TOKEN(sp,op,tok,t)	do { if ((t) > (tok)) { *(t) = 0; token((sp), (op), (t) = (tok), 0, 0); } } while (0)
#define PUSH(sp,op,h)		token((sp),(op),0,(h),1)
#define POP(sp,op,h)		token((sp),(op),0,(h),0)
151 
152 static void
parse(const char * path,Sfio_t * ip,Sfio_t * op)153 parse(const char* path, Sfio_t* ip, Sfio_t* op)
154 {
155 	register int		c;
156 	register int		i;
157 	register int		k;
158 	register int		q;
159 	register int		n;
160 	register int		x;
161 	register int		level;
162 	register char*		e;
163 	register char*		s;
164 	register char*		t;
165 	const Header_t*		h;
166 
167 	char			tag[256];
168 	char			tok[4 * 1024];
169 	unsigned char		lex[4 * 1024];
170 	const Header_t*		block[4 * 1024];
171 
172 	State_t			state;
173 
174 	state.prev = (Header_t*)&header[*(state.lex = lex) = LINE];
175 	state.push = 1;
176 	state.keep = 0;
177 	t = tok;
178 	k = q = n = level = 0;
179 	for (;;)
180 	{
181 		switch (c = sfgetc(ip))
182 		{
183 		case EOF:
184 			TOKEN(&state, op, tok, t);
185 			break;
186 		case '<':
187 			TOKEN(&state, op, tok, t);
188 			x = 0;
189 			s = tag;
190 			for (;;)
191 			{
192 				switch (c = sfgetc(ip))
193 				{
194 				case EOF:
195 					TOKEN(&state, op, tok, t);
196 					return;
197 				case '"':
198 					if (!q)
199 						q = c;
200 					else if (q == c)
201 						q = 0;
202 					goto keep;
203 				case '!':
204 					if (s != tag)
205 						goto keep;
206 					x = 1;
207 					continue;
208 				case '\n':
209 					x = 1;
210 					continue;
211 				case '>':
212 					if (!q)
213 						break;
214 					/*FALLTHROUGH*/
215 				default:
216 				keep:
217 					if (!x && s < &tag[sizeof(tag)-1])
218 						*s++ = isupper(c) ? tolower(c) : c;
219 					continue;
220 				}
221 				break;
222 			}
223 			*s = 0;
224 			s = tag;
225 			if (!k)
226 			{
227 				if (s[0] == 'b' && s[1] == 'o' && s[2] == 'd' && s[3] == 'y' && (!s[4] || s[4] == ' '))
228 					k = 1;
229 				else
230 					continue;
231 			}
232 			if (s[0] == 's' && s[1] == 'p' && s[2] == 'a' && s[3] == 'n' && (!s[4] || s[4] == ' ') && (s += 4) || s[0] == 't' && s[1] == 'd' && (!s[2] || s[2] == ' ') && (s += 2))
233 			{
234 				h = 0;
235 				if (s[0] == ' ' && strneq(s + 1, "class=\"", 7))
236 				{
237 					for (e = s += 8; *e && *e != '"'; e++);
238 					*e = 0;
239 					for (i = 0; i < elementsof(header); i++)
240 						if (streq(s, header[i].in))
241 						{
242 							h = &header[i];
243 							if (level < elementsof(block))
244 							{
245 								PUSH(&state, op, h);
246 								n++;
247 							}
248 							break;
249 						}
250 				}
251 				if (level < elementsof(block) && (block[level] = h))
252 					*++state.lex = h->lex;
253 				level++;
254 			}
255 			else if (s[0] == '/' && (s[1] == 's' && s[2] == 'p' && s[3] == 'a' && s[4] == 'n' && !s[5] || s[1] == 't' && s[2] == 'd' && !s[3]))
256 			{
257 				if (level > 0)
258 				{
259 					level--;
260 					if (level < elementsof(block) && (h = block[level]))
261 					{
262 						POP(&state, op, h);
263 						n--;
264 						state.lex--;
265 					}
266 				}
267 			}
268 			else if (n)
269 			{
270 				if (s[0] == 'b' && s[1] == 'r' && (!s[2] || s[2] == ' ' || s[2] == '/'))
271 				{
272 					if ((c = sfgetc(ip)) == '\n')
273 						continue;
274 					sfungetc(ip, c);
275 				}
276 				if (s[0] == 'a' && s[1] == ' ')
277 					PUSH(&state, op, &header[LINK]);
278 				else
279 				{
280 					c = ' ';
281 					goto space;
282 				}
283 			}
284 			continue;
285 		case '&':
286 			while ((c = sfgetc(ip)) != EOF && isalnum(c));
287 			c = ' ';
288 			goto space;
289 		case ':':
290 		case ';':
291 		case ',':
292 		case '.':
293 			if (*state.lex == CODE)
294 				goto code;
295 			TOKEN(&state, op, tok, t);
296 			*t++ = c;
297 			TOKEN(&state, op, tok, t);
298 			continue;
299 		case ' ':
300 		case '\t':
301 		case '\r':
302 		case '\v':
303 		space:
304 			if (*state.lex == CODE)
305 				goto code;
306 			TOKEN(&state, op, tok, t);
307 			continue;
308 		case '\n':
309 			if (*state.lex == CODE)
310 				goto code;
311 			TOKEN(&state, op, tok, t);
312 			PUSH(&state, op, &header[LINE]);
313 			continue;
314 		default:
315 			if (*state.lex == CODE)
316 				goto code;
317 			if (t >= &tok[sizeof(tok) - 1])
318 				TOKEN(&state, op, tok, t);
319 			*t++ = c;
320 			continue;
321 		code:
322 			sfputc(op, c);
323 			state.last = c;
324 			continue;
325 		}
326 		break;
327 	}
328 }
329 
330 int
main(int argc,char ** argv)331 main(int argc, char** argv)
332 {
333 	register char*		s;
334 	register Sfio_t*	ip;
335 
336 	NoP(argc);
337 	error_info.id = "bb2tok";
338 	for (;;)
339 	{
340 		switch (optget(argv, usage))
341 		{
342 		case '?':
343 			error(ERROR_USAGE|4, "%s", opt_info.arg);
344 			continue;
345 		case ':':
346 			error(2, "%s", opt_info.arg);
347 			continue;
348 		}
349 		break;
350 	}
351 	argv += opt_info.index;
352 	if (error_info.errors)
353 		error(ERROR_USAGE|4, "%s", optusage(NiL));
354 	do
355 	{
356 		if (!(s = *argv) || streq(s, "-") || streq(s, "/dev/stdin") || streq(s, "/dev/fd/0"))
357 		{
358 			s = "/dev/stdin";
359 			ip = sfstdin;
360 		}
361 		else if (!(ip = sfopen(NiL, s, "r")))
362 		{
363 			error(ERROR_SYSTEM|2, "%s: cannot read", s);
364 			continue;
365 		}
366 		parse(s, ip, sfstdout);
367 		if (ip != sfstdin)
368 			sfclose(ip);
369 	} while (*argv && *++argv);
370 	return error_info.errors != 0;
371 }
372