1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1996-2011 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * *
19 ***********************************************************************/
20 #pragma prototyped
21 /*
22 * Glenn Fowler
23 * AT&T Research
24 */
25
/*
 * command description string; main() passes it to optget() for option
 * parsing and hands optusage() output to error() on usage errors
 */

static const char usage[] =
"[-?\n@(#)$Id: bb2tok (AT&T Research) 2007-12-19 $\n]"
USAGE_LICENSE
"[+NAME?bb2tok - convert bb html to tokens]"
"[+DESCRIPTION?\bbb2tok\b extracts tokens from input \bhtml\b \afile\as. "
	"If \afile\a is not specified then the standard input is read. The "
	"\bhtml\b parse is rudimentary; don't use \bbb2tok\b to detect valid "
	"\bhtml\b files.]"

"\n"
"\n[ file ... ]\n"
"\n"

"[+SEE ALSO?\bhtml2db\b(1), \bhtml2rtf\b(1)]"
;
41
42 #include <ast.h>
43 #include <ctype.h>
44 #include <error.h>
45
/*
 * lex codes for the recognized blocks
 *
 * the values are also used as indices into header[] (header[LINK] and
 * header[LINE] are referenced directly), so they must follow the order
 * of the entries in that table
 */

enum
{
	LINK	= 0,
	NAME	= 1,
	HEADER	= 2,
	BODY	= 3,
	QUOTE	= 4,
	CODE	= 5,
	LABEL	= 6,
	LINE	= 7
};
54
/*
 * one recognized block type
 *
 * the strings are only ever read (compared against the input class value
 * and printed as tag names), and the table entries point at string
 * literals, so both are const
 */

typedef struct Header_s
{
	const char*	in;	/* class="..." value matched in the input	*/
	const char*	out;	/* tag name written to the output		*/
	int		lex;	/* lex code (LINK, NAME, ...)			*/
	int		unary;	/* nonzero for the stand-alone "name/" tags	*/
} Header_t;
62
63 static const Header_t header[] =
64 {
65 "a", "link/", LINK, 1,
66 "name", "name", NAME, 0,
67 "postdetails", "header", HEADER, 0,
68 "postbody", "body", BODY, 0,
69 "quote", "quote", QUOTE, 0,
70 "code", "code", CODE, 0,
71 "genmed", "label", LABEL, 0,
72 "line", "line/", LINE, 1,
73 };
74
/*
 * output state shared by parse() and token()
 */

typedef struct State_s
{
	Header_t*	prev;	/* block whose tag is pending output, 0 if none	*/
	int		push;	/* pending tag: nonzero => open, 0 => close	*/
	int		keep;	/* nonzero => text and pending tags are written	*/
	int		last;	/* last character written inside a code block	*/
	unsigned char*	lex;	/* top of the stack of open block lex codes	*/
} State_t;
83
84 static void
token(State_t * state,Sfio_t * op,const char * text,const Header_t * head,int push)85 token(State_t* state, Sfio_t* op, const char* text, const Header_t* head, int push)
86 {
87 if (!head)
88 {
89 if (state->keep)
90 {
91 if (*state->lex == LABEL && (streq(text, ":") || streq(text, "Code") || streq(text, "wrote")))
92 return;
93 if (state->prev)
94 {
95 sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
96 state->prev = 0;
97 }
98 sfputr(op, text, *state->lex == HEADER ? ' ' : '\n');
99 }
100 }
101 else if (push)
102 {
103 if (state->prev)
104 {
105 if (head->lex == LINK && state->prev->lex == NAME && state->push)
106 return;
107 if (head->lex == LINE && state->prev->lex == HEADER && !state->push)
108 return;
109 if (head->lex == HEADER && push && state->prev->lex == HEADER && !state->push)
110 {
111 state->prev = 0;
112 return;
113 }
114 if (state->keep && (state->prev->lex != head->lex || !head->unary && state->push))
115 sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
116 if (head->lex == LINE && state->prev->lex == BODY && !state->push)
117 state->keep = 0;
118 }
119 switch (head->lex)
120 {
121 case CODE:
122 state->prev = 0;
123 sfprintf(op, "<%s>", head->out);
124 return;
125 case NAME:
126 state->keep = 1;
127 break;
128 }
129 state->prev = (Header_t*)head;
130 state->push = push;
131 }
132 else
133 {
134 if (state->keep && state->prev)
135 {
136 if (state->prev->lex == head->lex && state->push)
137 {
138 state->prev = 0;
139 return;
140 }
141 sfprintf(op, "%s<%s%s>\n", (!state->push && (state->prev->lex == HEADER || state->prev->lex == CODE && state->last != '\n')) ? "\n" : "", state->push ? "" : "/", state->prev->out);
142 }
143 state->prev = (Header_t*)head;
144 state->push = push;
145 }
146 }
147
/*
 * TOKEN	if text has been collected in tok (t points past it) then
 *		terminate it, reset t to tok and pass the text to token()
 * PUSH		report the open of block h to token()
 * POP		report the close of block h to token()
 *
 * t must be an lvalue; arguments are parenthesized so that expression
 * arguments expand safely
 */

#define TOKEN(sp,op,tok,t)	do { if ((t) > (tok)) { *(t) = 0; token((sp), (op), (t) = (tok), 0, 0); } } while (0)
#define PUSH(sp,op,h)		token((sp), (op), 0, (h), 1)
#define POP(sp,op,h)		token((sp), (op), 0, (h), 0)
151
152 static void
parse(const char * path,Sfio_t * ip,Sfio_t * op)153 parse(const char* path, Sfio_t* ip, Sfio_t* op)
154 {
155 register int c;
156 register int i;
157 register int k;
158 register int q;
159 register int n;
160 register int x;
161 register int level;
162 register char* e;
163 register char* s;
164 register char* t;
165 const Header_t* h;
166
167 char tag[256];
168 char tok[4 * 1024];
169 unsigned char lex[4 * 1024];
170 const Header_t* block[4 * 1024];
171
172 State_t state;
173
174 state.prev = (Header_t*)&header[*(state.lex = lex) = LINE];
175 state.push = 1;
176 state.keep = 0;
177 t = tok;
178 k = q = n = level = 0;
179 for (;;)
180 {
181 switch (c = sfgetc(ip))
182 {
183 case EOF:
184 TOKEN(&state, op, tok, t);
185 break;
186 case '<':
187 TOKEN(&state, op, tok, t);
188 x = 0;
189 s = tag;
190 for (;;)
191 {
192 switch (c = sfgetc(ip))
193 {
194 case EOF:
195 TOKEN(&state, op, tok, t);
196 return;
197 case '"':
198 if (!q)
199 q = c;
200 else if (q == c)
201 q = 0;
202 goto keep;
203 case '!':
204 if (s != tag)
205 goto keep;
206 x = 1;
207 continue;
208 case '\n':
209 x = 1;
210 continue;
211 case '>':
212 if (!q)
213 break;
214 /*FALLTHROUGH*/
215 default:
216 keep:
217 if (!x && s < &tag[sizeof(tag)-1])
218 *s++ = isupper(c) ? tolower(c) : c;
219 continue;
220 }
221 break;
222 }
223 *s = 0;
224 s = tag;
225 if (!k)
226 {
227 if (s[0] == 'b' && s[1] == 'o' && s[2] == 'd' && s[3] == 'y' && (!s[4] || s[4] == ' '))
228 k = 1;
229 else
230 continue;
231 }
232 if (s[0] == 's' && s[1] == 'p' && s[2] == 'a' && s[3] == 'n' && (!s[4] || s[4] == ' ') && (s += 4) || s[0] == 't' && s[1] == 'd' && (!s[2] || s[2] == ' ') && (s += 2))
233 {
234 h = 0;
235 if (s[0] == ' ' && strneq(s + 1, "class=\"", 7))
236 {
237 for (e = s += 8; *e && *e != '"'; e++);
238 *e = 0;
239 for (i = 0; i < elementsof(header); i++)
240 if (streq(s, header[i].in))
241 {
242 h = &header[i];
243 if (level < elementsof(block))
244 {
245 PUSH(&state, op, h);
246 n++;
247 }
248 break;
249 }
250 }
251 if (level < elementsof(block) && (block[level] = h))
252 *++state.lex = h->lex;
253 level++;
254 }
255 else if (s[0] == '/' && (s[1] == 's' && s[2] == 'p' && s[3] == 'a' && s[4] == 'n' && !s[5] || s[1] == 't' && s[2] == 'd' && !s[3]))
256 {
257 if (level > 0)
258 {
259 level--;
260 if (level < elementsof(block) && (h = block[level]))
261 {
262 POP(&state, op, h);
263 n--;
264 state.lex--;
265 }
266 }
267 }
268 else if (n)
269 {
270 if (s[0] == 'b' && s[1] == 'r' && (!s[2] || s[2] == ' ' || s[2] == '/'))
271 {
272 if ((c = sfgetc(ip)) == '\n')
273 continue;
274 sfungetc(ip, c);
275 }
276 if (s[0] == 'a' && s[1] == ' ')
277 PUSH(&state, op, &header[LINK]);
278 else
279 {
280 c = ' ';
281 goto space;
282 }
283 }
284 continue;
285 case '&':
286 while ((c = sfgetc(ip)) != EOF && isalnum(c));
287 c = ' ';
288 goto space;
289 case ':':
290 case ';':
291 case ',':
292 case '.':
293 if (*state.lex == CODE)
294 goto code;
295 TOKEN(&state, op, tok, t);
296 *t++ = c;
297 TOKEN(&state, op, tok, t);
298 continue;
299 case ' ':
300 case '\t':
301 case '\r':
302 case '\v':
303 space:
304 if (*state.lex == CODE)
305 goto code;
306 TOKEN(&state, op, tok, t);
307 continue;
308 case '\n':
309 if (*state.lex == CODE)
310 goto code;
311 TOKEN(&state, op, tok, t);
312 PUSH(&state, op, &header[LINE]);
313 continue;
314 default:
315 if (*state.lex == CODE)
316 goto code;
317 if (t >= &tok[sizeof(tok) - 1])
318 TOKEN(&state, op, tok, t);
319 *t++ = c;
320 continue;
321 code:
322 sfputc(op, c);
323 state.last = c;
324 continue;
325 }
326 break;
327 }
328 }
329
330 int
main(int argc,char ** argv)331 main(int argc, char** argv)
332 {
333 register char* s;
334 register Sfio_t* ip;
335
336 NoP(argc);
337 error_info.id = "bb2tok";
338 for (;;)
339 {
340 switch (optget(argv, usage))
341 {
342 case '?':
343 error(ERROR_USAGE|4, "%s", opt_info.arg);
344 continue;
345 case ':':
346 error(2, "%s", opt_info.arg);
347 continue;
348 }
349 break;
350 }
351 argv += opt_info.index;
352 if (error_info.errors)
353 error(ERROR_USAGE|4, "%s", optusage(NiL));
354 do
355 {
356 if (!(s = *argv) || streq(s, "-") || streq(s, "/dev/stdin") || streq(s, "/dev/fd/0"))
357 {
358 s = "/dev/stdin";
359 ip = sfstdin;
360 }
361 else if (!(ip = sfopen(NiL, s, "r")))
362 {
363 error(ERROR_SYSTEM|2, "%s: cannot read", s);
364 continue;
365 }
366 parse(s, ip, sfstdout);
367 if (ip != sfstdin)
368 sfclose(ip);
369 } while (*argv && *++argv);
370 return error_info.errors != 0;
371 }
372