1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 4 -*- */
2 /*
3 * Copyright (C) 2003-2010 Shaun McCance <shaunm@gnome.org>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
17 *
18 * Author: Shaun McCance <shaunm@gnome.org>
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24
25 #include <glib.h>
26 #include <glib/gi18n.h>
27 #include <libxml/tree.h>
28 #include <libxml/xpath.h>
29 #include <gio/gio.h>
30 #include <gio/gunixinputstream.h>
31 #include <string.h>
32 #include <math.h>
33
34 #include "yelp-error.h"
35 #include "yelp-man-parser.h"
36
37 #define MAN_FONTS 8
38
39 /* The format has two copies of the title like MAN(1) at the top,
40 * possibly with a string of text in between for the collection.
41 *
42 * Start with the parser on START, then HAVE_TITLE when we've read the
43 * first word with parentheses. At that point, stick new words into
44 * the "collection" tag. Then finally switch to BODY when we've seen
45 * the second copy of the one with parentheses.
46 */
typedef enum ManParserState
{
    START,      /* No "name(section)" title has been recognised yet */
    HAVE_TITLE, /* Title seen; text is collected for the "collection" tag */
    BODY        /* Header finished; text is handed to parse_body_text */
} ManParserState;
53
/* See parse_body_text for how this is used. */
typedef enum ManParserSectionState
{
    SECTION_TITLE, /* Accumulated text is the title of the current section */
    SECTION_BODY   /* Accumulated text belongs to the body of the section */
} ManParserSectionState;
60
/* All of the state for parsing one man page: the XML tree being built,
 * the input stream and current line, and the bits of troff state
 * (fonts, position) that the line handlers need to share.
 */
struct _YelpManParser {
    xmlDocPtr doc;           /* The top-level XML document */
    xmlNodePtr header;       /* The header node */
    xmlNodePtr section_node; /* The current section */
    xmlNodePtr sheet_node;   /* The current sheet */

    GDataInputStream *stream; /* The GIO input stream to read from */
    gchar *buffer;            /* The buffer, line at a time. parse_w and
                               * cheeky_call_parse_line temporarily point
                               * this somewhere else and then restore it. */
    gsize length;             /* The buffer length */

    gchar *section;           /* The name of the current section */

    /* The width and height of a character according to troff. */
    guint char_width;
    guint char_height;

    /* Count the number of lines we've parsed (needed to get prologue) */
    guint line_no;

    /* The x f k name command sets the k'th register to be name. */
    gchar* font_registers[MAN_FONTS];

    /* The current font. Should be the index of one of the
     * font_registers. Starts at 0 (of course!)
     */
    guint current_font;

    /* See description of ManParserState above */
    ManParserState state;

    /* Vertical and horizontal position as far as the troff output is
     * concerned. (Measured from top-left).
     */
    guint vpos, hpos;

    /* Text accumulator (needed since it comes through in dribs &
     * drabs...) */
    GString *accumulator;

    /* See parse_body_text for how this is used. */
    ManParserSectionState section_state;

    /* The indent of the current sheet (the hpos at which it was
     * created, see new_sheet) */
    guint sheet_indent;

    /* Set to TRUE if there's been a newline since the last text was
     * parsed. */
    gboolean newline;

    /* Count the number of 'N' lines we've seen since the last h
     * command. This is because for some reason N doesn't
     * automatically move the position forward. Thus immediately after
     * one, you see a h24 or the like. Unless there's a space. Then it
     * might be wh48. This is set in parse_N (and parse_C) and used in
     * parse_h.
     */
    guint N_count;

    /* Keep track of whether the last character was a space. We can't
     * just do this by looking at the last char of accumulator,
     * because if there's a font change, it gets zeroed. This gets set
     * to TRUE by parse_w and is FALSE the rest of the time.
     */
    gboolean last_char_was_space;

    /* Keep track of the size of the last vertical jump - used to tell
     * whether we need to insert extra space above a line.
     */
    gint last_vertical_jump;

    /* The title we read earlier (eg 'Foo(2)') */
    gchar *title_str;
};
134
/* Entry point for a single line of troff output (in parser->buffer). */
static gboolean parser_parse_line (YelpManParser *parser, GError **error);
/* Handler for the first three lines of output (the prologue). */
static gboolean parse_prologue_line (YelpManParser *parser, GError **error);

/* Parsers for different types of line. Each one reads parser->buffer
 * and returns FALSE (with *error set) if the line can't be understood.
 */
typedef gboolean (*LineParser)(YelpManParser *, GError **);
#define DECLARE_LINE_PARSER(name) \
    static gboolean (name) (YelpManParser *parser, GError **error);

DECLARE_LINE_PARSER (parse_xf)
DECLARE_LINE_PARSER (parse_f)
DECLARE_LINE_PARSER (parse_V)
DECLARE_LINE_PARSER (parse_H)
DECLARE_LINE_PARSER (parse_v)
DECLARE_LINE_PARSER (parse_h)
DECLARE_LINE_PARSER (parse_text)
DECLARE_LINE_PARSER (parse_w)
DECLARE_LINE_PARSER (parse_body_text)
DECLARE_LINE_PARSER (parse_n)
DECLARE_LINE_PARSER (parse_N)
DECLARE_LINE_PARSER (parse_C)
DECLARE_LINE_PARSER (parse_p)
156
/* Declare a sort of alist registry of parsers for different lines.
 *
 * parser_parse_line walks this table in order and calls the handler of
 * the first entry whose prefix starts the line, so order matters. A
 * line that matches no prefix is silently ignored. The table ends with
 * a { NULL, NULL } sentinel.
 */
struct LineParsePair
{
    const gchar *prefix;  /* What the line must start with */
    LineParser handler;   /* The function to call for such a line */
};
static struct LineParsePair line_parsers[] = {
    { "x f", parse_xf }, { "f", parse_f },
    { "V", parse_V }, { "H", parse_H },
    { "v", parse_v }, { "h", parse_h },
    { "t", parse_text },
    { "w", parse_w },
    { "n", parse_n },
    { "N", parse_N },
    { "C", parse_C },
    { "p", parse_p },
    { NULL, NULL }
};
175
176 /******************************************************************************/
177 /* Parser helper functions (managing the state of the various parsing
178 * bits) */
/* Flush the accumulator into a <span> on the current sheet */
static void        finish_span        (YelpManParser *parser);
/* Convert a horizontal distance into a number of character widths */
static guint       dx_to_em_count     (YelpManParser *parser, guint dx);
/* Append k UTF-8 non-breaking spaces to the accumulator */
static void        append_nbsps       (YelpManParser *parser, guint k);
/* Start a new sheet if needed at the first word of a line */
static void        deal_with_newlines (YelpManParser *parser);
/* Add a new <sheet> to the current section at the current hpos */
static void        new_sheet          (YelpManParser *parser);
/* Add <title> and <section> children to the header */
static void        register_title     (YelpManParser *parser,
                                       const gchar* name, const gchar* section);
/* Chop from the end of dst what it has in common with the end of src */
static void        right_truncate_common (gchar *dst, const gchar *src);
/* Feed text to parse_text as if it were a line starting with first_char */
static gboolean    cheeky_call_parse_line (YelpManParser *parser,
                                           GError **error,
                                           gchar first_char,
                                           const gchar *text);
static void        cleanup_parsed_page (YelpManParser *parser);
static gboolean    parse_last_line (YelpManParser *parser, gchar* line);
static void        unicode_strstrip (gchar *str);
194
195 /*
196 A link_inserter takes
197 (1) an array of offsets for the different spans within the string
198 (2) the match info from the regex match
199
200 It's then responsible for mangling the XML tree to insert the actual
201 link. Finally, it should return the offset into the string of the
202 end of what it's just dealt with. If necessary, it should also fix
203 up offsets to point correctly at the last node inserted.
204 */
/* One entry of the offsets array described above.
 * NOTE(review): start/end are presumably the offsets in the string
 * covered by the element elt - confirm against fixup_links.
 */
typedef struct {
    gsize start, end;
    xmlNodePtr elt;
} offset_elt_pair;

/* Signature of a link inserter; returns the offset into the string of
 * the end of what it has dealt with (see the comment above).
 */
typedef gsize (*link_inserter)(offset_elt_pair *,
                               const GMatchInfo *);

static void fixup_links (YelpManParser *parser,
                         const GRegex *matcher,
                         link_inserter inserter);

static gsize man_link_inserter (offset_elt_pair *offsets,
                                const GMatchInfo *match_info);
static gsize http_link_inserter (offset_elt_pair *offsets,
                                 const GMatchInfo *match_info);
221
222 /******************************************************************************/
223 /* Translations for the 'C' command. This is indeed hackish, but the
224 * -Tutf8 output doesn't seem to give include files so we can do this
225 * at runtime :-(
226 *
227 * On my machine, this data's at /usr/share/groff/current/tmac/ in
228 * latin1.tmac, unicode.tmac and I worked out the lq and rq from
229 * running man: I'm not sure where that comes from!
230 */
/* Maps the name given in a 'C' command to the Unicode code point it
 * stands for.
 */
struct StringPair
{
    const gchar *from;  /* The troff special character name */
    gunichar to;        /* The Unicode code point to emit */
};
/* Searched linearly by parse_C; terminated by a { NULL, 0 } sentinel. */
static const struct StringPair char_translations[] = {
    { "r!", 161 },
    { "ct", 162 },
    { "Po", 163 },
    { "Cs", 164 },
    { "Ye", 165 },
    { "bb", 166 },
    { "sc", 167 },
    { "ad", 168 },
    { "co", 169 },
    { "Of", 170 },
    { "Fo", 171 },
    { "tno", 172 },
    { "%", 173 },
    { "rg", 174 },
    { "a-", 175 },
    { "de", 176 },
    { "t+-", 177 },
    { "S2", 178 },
    { "S3", 179 },
    { "aa", 180 },
    { "mc", 181 },
    { "ps", 182 },
    { "pc", 183 },
    { "ac", 184 },
    { "S1", 185 },
    { "Om", 186 },
    { "Fc", 187 },
    { "14", 188 },
    { "12", 189 },
    { "34", 190 },
    { "r?", 191 },
    { "`A", 192 },
    { "'A", 193 },
    { "^A", 194 },
    { "~A", 195 },
    { ":A", 196 },
    { "oA", 197 },
    { "AE", 198 },
    { ",C", 199 },
    { "`E", 200 },
    { "'E", 201 },
    { "^E", 202 },
    { ":E", 203 },
    { "`I", 204 },
    { "'I", 205 },
    { "^I", 206 },
    { ":I", 207 },
    { "-D", 208 },
    { "~N", 209 },
    { "`O", 210 },
    { "'O", 211 },
    { "^O", 212 },
    { "~O", 213 },
    { ":O", 214 },
    { "tmu", 215 },
    { "/O", 216 },
    { "`U", 217 },
    { "'U", 218 },
    { "^U", 219 },
    { ":U", 220 },
    { "'Y", 221 },
    { "TP", 222 },
    { "ss", 223 },
    { "`a", 224 },
    { "'a", 225 },
    { "^a", 226 },
    { "~a", 227 },
    { ":a", 228 },
    { "oa", 229 },
    { "ae", 230 },
    { ",c", 231 },
    { "`e", 232 },
    { "'e", 233 },
    { "^e", 234 },
    { ":e", 235 },
    { "`i", 236 },
    { "'i", 237 },
    { "^i", 238 },
    { ":i", 239 },
    { "Sd", 240 },
    { "~n", 241 },
    { "`o", 242 },
    { "'o", 243 },
    { "^o", 244 },
    { "~o", 245 },
    { ":o", 246 },
    { "tdi", 247 },
    { "/o", 248 },
    { "`u", 249 },
    { "'u", 250 },
    { "^u", 251 },
    { ":u", 252 },
    { "'y", 253 },
    { "Tp", 254 },
    { ":y", 255 },
    { "hy", '-' },
    { "oq", '`' },
    { "cq", '\'' },
    { "lq", 8220 }, // left smart quotes
    { "rq", 8221 }, // right smart quotes
    { "en", 8211 }, // en-dash
    { "em", 8212 }, // em-dash
    { "la", 10216 }, // left angle bracket
    { "ra", 10217 }, // right angle bracket
    { "rs", '\\' },
    { "<=", 8804 }, // < or equal to sign
    { ">=", 8805 }, // > or equal to sign
    { "aq", '\'' },
    { "tm", 8482 }, // trademark symbol
    { NULL, 0 }
};
348
349 /******************************************************************************/
350
351 YelpManParser *
yelp_man_parser_new(void)352 yelp_man_parser_new (void)
353 {
354 YelpManParser *parser = g_new0 (YelpManParser, 1);
355 parser->accumulator = g_string_sized_new (1024);
356 return parser;
357 }
358
359 /*
360 This function is responsible for taking a path to a man file and
361 returning something in the groff intermediate output format for us
362 to use.
363
364 If something goes wrong, we return NULL and set error to be a
365 YelpError describing the problem.
366 */
367 static GInputStream*
get_troff(gchar * path,GError ** error)368 get_troff (gchar *path, GError **error)
369 {
370 gint ystdout;
371 GError *err = NULL;
372 const gchar *argv[] = { "/usr/local/libexec", "/yelp-groff", path, NULL };
373 gchar **my_argv;
374
375 /* g_strdupv() should accept a "const gchar **". */
376 my_argv = g_strdupv ((gchar **) argv);
377
378 if (!g_spawn_async_with_pipes (NULL, my_argv, NULL,
379 G_SPAWN_SEARCH_PATH, NULL, NULL,
380 NULL, NULL, &ystdout, NULL, &err)) {
381 /* We failed to run the man program. Return a "Huh?" error. */
382 *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN,
383 "%s", err->message);
384 g_error_free (err);
385 g_strfreev (my_argv);
386 return NULL;
387 }
388
389 g_strfreev (my_argv);
390
391 return (GInputStream*) g_unix_input_stream_new (ystdout, TRUE);
392 }
393
394 xmlDocPtr
yelp_man_parser_parse_file(YelpManParser * parser,gchar * path,GError ** error)395 yelp_man_parser_parse_file (YelpManParser *parser,
396 gchar *path,
397 GError **error)
398 {
399 GInputStream *troff_stream;
400 gboolean ret;
401 xmlNodePtr root;
402
403 troff_stream = get_troff (path, error);
404 if (!troff_stream) return NULL;
405
406 parser->stream = g_data_input_stream_new (troff_stream);
407
408 parser->doc = xmlNewDoc (BAD_CAST "1.0");
409 root = xmlNewNode (NULL, BAD_CAST "Man");
410 xmlDocSetRootElement (parser->doc, root);
411
412 parser->header = xmlNewNode (NULL, BAD_CAST "header");
413 xmlAddChild (root, parser->header);
414
415 while (1) {
416 parser->buffer =
417 g_data_input_stream_read_line (parser->stream,
418 &(parser->length),
419 NULL, NULL);
420 if (parser->buffer == NULL) break;
421
422 parser->line_no++;
423 ret = parser_parse_line (parser, error);
424
425 g_free (parser->buffer);
426
427 if (!ret) {
428 xmlFreeDoc (parser->doc);
429 parser->doc = NULL;
430 break;
431 }
432 }
433
434 cleanup_parsed_page (parser);
435
436 g_object_unref (parser->stream);
437
438 return parser->doc;
439 }
440
441 void
yelp_man_parser_free(YelpManParser * parser)442 yelp_man_parser_free (YelpManParser *parser)
443 {
444 guint k;
445
446 if (parser == NULL)
447 return;
448
449 for (k=0; k<MAN_FONTS; k++)
450 g_free (parser->font_registers[k]);
451 g_string_free (parser->accumulator, TRUE);
452 g_free (parser->title_str);
453 g_free (parser->section);
454 g_free (parser);
455 }
456
457 /******************************************************************************/
458
459 /* Sets the k'th font register to be name. Copies name, so free it
460 * afterwards. k should be in [0,MAN_FONTS). It seems that man always
461 * gives us ones at least 1, but groff_out(5) says non-negative.
462 */
463 static void
set_font_register(YelpManParser * parser,guint k,const gchar * name)464 set_font_register (YelpManParser *parser, guint k, const gchar* name)
465 {
466 if (k >= MAN_FONTS) {
467 g_warning ("Tried to set nonexistant font register %u to %s",
468 k, name);
469 return;
470 }
471 g_free (parser->font_registers[k]);
472 parser->font_registers[k] = g_strdup (name);
473 }
474
475 static const gchar*
get_font(const YelpManParser * parser)476 get_font (const YelpManParser *parser)
477 {
478 guint k = parser->current_font;
479 if (k >= MAN_FONTS ||
480 parser->font_registers[k] == NULL) {
481
482 g_warning ("Tried to get nonexistant font register %u", k);
483
484 return "";
485 }
486
487 return parser->font_registers[k];
488 }
489
490 /******************************************************************************/
491
/*
  Convenience macros to scan a string, checking for the correct number
  of things read. Note that SSCANF is TRUE when the scan *failed*
  (the number of items read wasn't num), so it reads naturally as
  "if (SSCANF (...)) complain".

  Also to raise an error. Add an %s to the end of the format string,
  which automatically gets given parser->buffer.

  All of these expect variables called parser and (for the error
  ones) error to be in scope. RAISE_PARSE_ERROR stores the new error
  in *error and returns FALSE from the calling function.
*/
#define SSCANF(fmt,num,...)                                 \
    (sscanf (parser->buffer, (fmt), __VA_ARGS__) != (num))

#define PARSE_ERROR(...)                                    \
    g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING,         \
                 __VA_ARGS__, parser->buffer)
#define RAISE_PARSE_ERROR(...)                              \
    { *error = PARSE_ERROR (__VA_ARGS__); return FALSE; }
507
508 static gboolean
parser_parse_line(YelpManParser * parser,GError ** error)509 parser_parse_line (YelpManParser *parser, GError **error)
510 {
511 const struct LineParsePair *p;
512
513 if (parser->line_no <= 3)
514 return parse_prologue_line (parser, error);
515
516 p = line_parsers;
517 while (p->handler != NULL) {
518 if (g_str_has_prefix (parser->buffer, p->prefix)) {
519 return p->handler(parser, error);
520 }
521 p++;
522 }
523 return TRUE;
524 }
525
526 static gboolean
parse_prologue_line(YelpManParser * parser,GError ** error)527 parse_prologue_line (YelpManParser *parser, GError **error)
528 {
529 if (parser->line_no != 2) return TRUE;
530
531 /* This is the interesting line, which should look like
532 x res 240 24 40
533 The interesting bits are the 24 and the 40, which are the
534 width and height of a character as far as -Tutf8 is
535 concerned.
536 */
537 if (SSCANF ("x %*s %*u %u %u", 2,
538 &parser->char_width, &parser->char_height)) {
539 RAISE_PARSE_ERROR ("Wrong 'x res' line from troff: %s");
540 }
541
542 return TRUE;
543 }
544
545 static gboolean
parse_xf(YelpManParser * parser,GError ** error)546 parse_xf (YelpManParser *parser, GError **error)
547 {
548 gchar name[11];
549 guint k;
550
551 if (SSCANF ("x f%*s %u %10s", 2, &k, name)) {
552 RAISE_PARSE_ERROR ("Invalid 'x f' line from troff: %s");
553 }
554 set_font_register (parser, k, name);
555 return TRUE;
556 }
557
558 static gboolean
parse_f(YelpManParser * parser,GError ** error)559 parse_f (YelpManParser *parser, GError **error)
560 {
561 guint k;
562 if (SSCANF ("f%u", 1, &k)) {
563 RAISE_PARSE_ERROR ("Invalid font line from troff: %s");
564 }
565 finish_span (parser);
566
567 parser->current_font = k;
568
569 return TRUE;
570 }
571
572 static gboolean
parse_v(YelpManParser * parser,GError ** error)573 parse_v (YelpManParser *parser, GError **error)
574 {
575 guint dy;
576 if (SSCANF ("v%u", 1, &dy)) {
577 RAISE_PARSE_ERROR ("Invalid v line from troff: %s");
578 }
579 parser->last_vertical_jump += dy;
580 parser->vpos += dy;
581 return TRUE;
582 }
583
584 static gboolean
parse_h(YelpManParser * parser,GError ** error)585 parse_h (YelpManParser *parser, GError **error)
586 {
587 guint dx;
588 int k;
589
590 if (SSCANF ("h%u", 1, &dx)) {
591 RAISE_PARSE_ERROR ("Invalid h line from troff: %s");
592 }
593 parser->hpos += dx;
594
595 /* This is a bit hackish to be honest but... if we're in something
596 * that'll end up in a span, a spacing h command means that a gap
597 * should appear. It seems that the easiest way to get this is to
598 * insert nonbreaking spaces (eugh!)
599 *
600 * Of course we don't want to do this when chained from wh24 or
601 * whatever, so use the last_char_was_space flag
602 * but... unfortunately some documents actually use stuff like
603 * wh96 for spacing (eg the lists in perl(1)). So (very hackish!),
604 * ignore double spaces, since that's probably just been put in to
605 * make the text justified (eugh), but allow bigger jumps.
606 *
607 * Incidentally, the perl manual here has bizarre gaps in the
608 * synopsis section. God knows why, but man displays them too so
609 * it's not our fault! :-)
610 */
611 k = dx_to_em_count (parser, dx);
612
613 if ((!parser->last_char_was_space) || (k > 2)) {
614
615 k -= parser->N_count;
616 if (k < 0) k = 0;
617
618 append_nbsps (parser, k);
619 }
620
621 parser->N_count = 0;
622
623 return TRUE;
624 }
625
626 static gboolean
parse_V(YelpManParser * parser,GError ** error)627 parse_V (YelpManParser *parser, GError **error)
628 {
629 guint y;
630 if (SSCANF ("V%u", 1, &y)) {
631 RAISE_PARSE_ERROR ("Invalid V line from troff: %s");
632 }
633 parser->last_vertical_jump += y - parser->vpos;
634 parser->vpos = y;
635 return TRUE;
636 }
637
638 static gboolean
parse_H(YelpManParser * parser,GError ** error)639 parse_H (YelpManParser *parser, GError **error)
640 {
641 guint x;
642 if (SSCANF ("H%u", 1, &x)) {
643 RAISE_PARSE_ERROR ("Invalid H line from troff: %s");
644 }
645 parser->hpos = x;
646 return TRUE;
647 }
648
649 static gboolean
parse_text(YelpManParser * parser,GError ** error)650 parse_text (YelpManParser *parser, GError **error)
651 {
652 gchar *text, *section, *tmp;
653 const gchar *acc;
654
655 /*
656 Sneakily, this might get called with something other than t
657 starting the buffer: see parse_C and parse_N.
658 */
659 if (parser->buffer[0] == 't') {
660 parser->N_count = 0;
661 }
662
663 if (parser->state == START) {
664 /* This should be the 'Title String(1)' line. It might come in
665 * chunks (for example, it might be more than one line
666 * long!). So just read bits until we get a (blah) bit: stick
667 * everything in the accumulator and check for
668 * parentheses. When we've got some, stick the parsed title in
669 * the header and switch to HAVE_TITLE.
670 *
671 * The parse_n code will error out if we didn't manage to get
672 * a title before the first newline and otherwise is in charge
673 * of switching to body-parsing mode.
674 */
675 g_string_append (parser->accumulator, parser->buffer+1);
676
677 acc = parser->accumulator->str;
678
679 section = strchr (acc, '(');
680
681 if (section) {
682 section++;
683 tmp = strchr (section, ')');
684 }
685
686 if (section && tmp) {
687 /* We've got 'Blah (3)' or the like in the accumulator */
688 if (*(tmp+1) != '\0') {
689 RAISE_PARSE_ERROR ("Don't understand title line: '%s'");
690 }
691 parser->state = HAVE_TITLE;
692 parser->title_str = g_strdup (acc);
693
694 text = g_strndup (acc, (section - 1) - acc);
695 section = g_strndup (section, tmp - section);
696
697 register_title (parser, text, section);
698
699 g_string_truncate (parser->accumulator, 0);
700
701 g_free (text);
702 parser->section = section;
703 }
704
705 return TRUE;
706 }
707
708 if (parser->state == BODY)
709 return parse_body_text (parser, error);
710
711 /* In state HAVE_TITLE */
712 else {
713 /* We expect (maybe!) to get some lines in between the two
714 * occurrences of the title itself. So collect up all the text
715 * we get and then we'll remove the copy of the title at the
716 * end (hopefully) when we find a newline in parse_n.
717 */
718 g_string_append (parser->accumulator, parser->buffer+1);
719 return TRUE;
720 }
721 }
722
723 static gboolean
parse_body_text(YelpManParser * parser,GError ** error)724 parse_body_text (YelpManParser *parser, GError **error)
725 {
726 /*
727 It's this function which is responsible for trying to get *some*
728 semantic information back out of the manual page.
729
730 The highest-level chopping up is into sections. We use the
731 heuristic that if either
732 (1) We haven't got a section yet or
733 (2) text starts a line (hpos=0)
734 then it's a section title.
735
736 It's possible to have spaces in section titles, so we carry on
737 accumulating the section title until the next newline.
738 */
739 if (parser->section_state == SECTION_BODY &&
740 (!parser->section_node || (parser->hpos == 0))) {
741 g_string_truncate (parser->accumulator, 0);
742 /* End the current sheet & section */
743 parser->section_state = SECTION_TITLE;
744 parser->sheet_node = NULL;
745
746 parser->section_node =
747 xmlAddChild (xmlDocGetRootElement (parser->doc),
748 xmlNewNode (NULL, BAD_CAST "section"));
749 }
750
751 if (parser->section_state != SECTION_TITLE) {
752 deal_with_newlines (parser);
753 }
754
755 g_string_append (parser->accumulator, parser->buffer+1);
756
757 /* Move hpos forward per char */
758 parser->hpos += strlen (parser->buffer+1) * parser->char_width;
759
760 parser->last_char_was_space = FALSE;
761
762 return TRUE;
763 }
764
765 /*
766 w is a sort of prefix argument. It indicates a space, so we register
767 that here, then call parser_parse_line again on the rest of the
768 string to deal with that.
769 */
770 static gboolean
parse_w(YelpManParser * parser,GError ** error)771 parse_w (YelpManParser *parser, GError **error)
772 {
773 gboolean ret;
774
775 if (parser->state != START) {
776 g_string_append_c (parser->accumulator, ' ');
777 }
778
779 parser->buffer++;
780 parser->last_char_was_space = TRUE;
781
782 ret = parser_parse_line (parser, error);
783
784 parser->buffer--;
785 return ret;
786 }
787
788 static gboolean
parse_n(YelpManParser * parser,GError ** error)789 parse_n (YelpManParser *parser, GError **error)
790 {
791 xmlNodePtr node;
792
793 /* When we're in the header, the parse_n is responsible for
794 * switching to body text. (See the body of parse_text() for more
795 * of an explanation).
796 */
797 if (parser->state == START) {
798 /* Oh no! We've not got a proper title yet! Ho hum, let's
799 stick whatever's going into a 'title title' and have a null
800 section. Sob.
801 */
802 register_title (parser,
803 parser->accumulator->str,
804 "unknown section");
805 g_string_truncate (parser->accumulator, 0);
806 parser->state = BODY;
807 return TRUE;
808 }
809
810 if (parser->state == HAVE_TITLE) {
811 /* What we've got so far is the manual's collection, followed
812 by the title again. So we want to get rid of the latter if
813 possible...
814 */
815 right_truncate_common (parser->accumulator->str,
816 parser->title_str);
817 unicode_strstrip (parser->accumulator->str);
818
819 xmlNewTextChild (parser->header,
820 NULL, BAD_CAST "collection",
821 BAD_CAST parser->accumulator->str);
822 g_string_truncate (parser->accumulator, 0);
823 parser->state = BODY;
824 parser->section_state = SECTION_BODY;
825 return TRUE;
826 }
827
828 /* parser->state == BODY */
829 if (parser->section_state == SECTION_TITLE) {
830
831 g_strchomp (parser->accumulator->str);
832 xmlNewTextChild (parser->section_node, NULL,
833 BAD_CAST "title",
834 BAD_CAST parser->accumulator->str);
835 g_string_truncate (parser->accumulator, 0);
836
837 parser->section_state = SECTION_BODY;
838 }
839 else if (parser->sheet_node != NULL) {
840 /*
841 In the body of a section, when we get to a newline we should
842 have an accumulator with text in it and a non-null sheet
843 (hopefully!).
844
845 We know the current font, so add a span for that font
846 containing the relevant text. Then add a <br/> tag.
847 */
848 finish_span (parser);
849 node = xmlNewNode (NULL, BAD_CAST "br");
850 xmlAddChild (parser->sheet_node, node);
851 }
852
853 parser->newline = TRUE;
854 parser->last_char_was_space = FALSE;
855
856 return TRUE;
857 }
858
859 static void
finish_span(YelpManParser * parser)860 finish_span (YelpManParser *parser)
861 {
862 xmlNodePtr node;
863
864 if (parser->accumulator->str[0] != '\0') {
865 node = xmlNewTextChild (parser->sheet_node, NULL,
866 BAD_CAST "span",
867 BAD_CAST parser->accumulator->str);
868 xmlNewProp (node, BAD_CAST "class",
869 BAD_CAST get_font (parser));
870 g_string_truncate (parser->accumulator, 0);
871 }
872 }
873
874 static guint
dx_to_em_count(YelpManParser * parser,guint dx)875 dx_to_em_count (YelpManParser *parser, guint dx)
876 {
877 return (int)(dx / ((float)parser->char_width));
878 }
879
880 static gboolean
parse_N(YelpManParser * parser,GError ** error)881 parse_N (YelpManParser *parser, GError **error)
882 {
883 gint n;
884 gchar tmp[2];
885
886 if (SSCANF ("N%i", 1, &n)) {
887 RAISE_PARSE_ERROR ("Strange format for N line: %s");
888 }
889 if (n > 127) {
890 RAISE_PARSE_ERROR ("N line has non-7-bit character: %s");
891 }
892 if (n < -200) {
893 RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s");
894 }
895
896 if (n < 0) {
897 append_nbsps (parser, -n);
898 parser->N_count += -n;
899 return TRUE;
900 }
901
902 parser->N_count++;
903
904 tmp[0] = (gchar)n;
905 tmp[1] = '\0';
906
907 return cheeky_call_parse_line (parser, error, 'N', tmp);
908 }
909
910 static void
append_nbsps(YelpManParser * parser,guint k)911 append_nbsps (YelpManParser *parser, guint k)
912 {
913 for (; k > 0; k--) {
914 /* 0xc2 0xa0 is nonbreaking space in utf8 */
915 g_string_append_c (parser->accumulator, 0xc2);
916 g_string_append_c (parser->accumulator, 0xa0);
917 }
918 }
919
920 static gboolean
parse_C(YelpManParser * parser,GError ** error)921 parse_C (YelpManParser *parser, GError **error)
922 {
923 gchar name[17];
924 gunichar code = 0;
925 guint k;
926 gint len;
927
928 if (SSCANF ("C%16s", 1, name)) {
929 RAISE_PARSE_ERROR ("Can't understand special character: %s");
930 }
931
932 for (k=0; char_translations[k].from; k++) {
933 if (g_str_equal (char_translations[k].from, name)) {
934 code = char_translations[k].to;
935 break;
936 }
937 }
938 if (sscanf (name, "u%x", &k) == 1) {
939 code = k;
940 }
941
942 if (!code) {
943 g_warning ("Couldn't parse troff special character: '%s'",
944 name);
945 code = 65533; /* Unicode replacement character */
946 }
947
948 /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */
949 len = g_unichar_to_utf8 (code, name);
950 name[len] = '\0';
951
952 parser->N_count++;
953
954 return cheeky_call_parse_line (parser, error, 'C', name);
955 }
956
957 static void
deal_with_newlines(YelpManParser * parser)958 deal_with_newlines (YelpManParser *parser)
959 {
960 /*
961 If newline is true, this is the first word on a line.
962
963 In which case, we check to see whether hpos agrees with the
964 current sheet's indent. If so (or if there isn't a sheet yet!),
965 we just add to the accumulator. If not, start a new sheet with
966 the correct indent.
967
968 If we aren't the first word on the line, just add to the
969 accumulator.
970 */
971 gchar tmp[64];
972 guint jump_lines;
973 gboolean made_sheet = FALSE, dont_jump = FALSE;
974
975 /* This only happens at the start of a section, where there's
976 already a gap
977 */
978 if (!parser->sheet_node) {
979 dont_jump = TRUE;
980 }
981
982 if ((!parser->sheet_node) ||
983 (parser->newline && (parser->hpos != parser->sheet_indent))) {
984 new_sheet (parser);
985 made_sheet = TRUE;
986 }
987
988 if (parser->newline) {
989 if ((parser->last_vertical_jump > 0) && (!dont_jump)) {
990 jump_lines =
991 parser->last_vertical_jump/parser->char_height;
992 } else {
993 jump_lines = 1;
994 }
995
996 if (jump_lines > 1) {
997 if (!made_sheet) new_sheet (parser);
998 made_sheet = TRUE;
999 }
1000
1001 snprintf (tmp, 64, "%u", dx_to_em_count (parser, parser->hpos));
1002 xmlNewProp (parser->sheet_node,
1003 BAD_CAST "indent", BAD_CAST tmp);
1004
1005 if (made_sheet) {
1006 snprintf (tmp, 64, "%u", jump_lines-1);
1007 xmlNewProp (parser->sheet_node,
1008 BAD_CAST "jump", BAD_CAST tmp);
1009 }
1010 }
1011
1012 parser->newline = FALSE;
1013 parser->last_vertical_jump = 0;
1014 }
1015
1016 static gboolean
parse_p(YelpManParser * parser,GError ** error)1017 parse_p (YelpManParser *parser, GError **error)
1018 {
1019 parser->vpos = 0;
1020 parser->hpos = 0;
1021 return TRUE;
1022 }
1023
1024 static void
new_sheet(YelpManParser * parser)1025 new_sheet (YelpManParser *parser)
1026 {
1027 /* We don't need to worry about finishing the current sheet,
1028 since the accumulator etc. get cleared on newlines and we
1029 know we're at the start of a line.
1030 */
1031 parser->sheet_node =
1032 xmlAddChild (parser->section_node,
1033 xmlNewNode (NULL, BAD_CAST "sheet"));
1034 parser->sheet_indent = parser->hpos;
1035 }
1036
1037 static void
register_title(YelpManParser * parser,const gchar * name,const gchar * section)1038 register_title (YelpManParser *parser,
1039 const gchar* name, const gchar* section)
1040 {
1041 xmlNewTextChild (parser->header,
1042 NULL, BAD_CAST "title", BAD_CAST name);
1043 xmlNewTextChild (parser->header,
1044 NULL, BAD_CAST "section", BAD_CAST section);
1045 }
1046
1047 static void
right_truncate_common(gchar * dst,const gchar * src)1048 right_truncate_common (gchar *dst, const gchar *src)
1049 {
1050 guint len_src = strlen (src);
1051 guint len_dst = strlen (dst);
1052
1053 guint k = (len_src < len_dst) ? len_src - 1 : len_dst - 1;
1054
1055 dst += len_dst - 1;
1056 src += len_src - 1;
1057
1058 while (k > 0) {
1059 if (*dst != *src) break;
1060 *dst = '\0';
1061
1062 k--;
1063 dst--;
1064 src--;
1065 }
1066 }
1067
1068 static gboolean
cheeky_call_parse_line(YelpManParser * parser,GError ** error,gchar first_char,const gchar * text)1069 cheeky_call_parse_line (YelpManParser *parser, GError **error,
1070 gchar first_char, const gchar* text)
1071 {
1072 /* Do a cunning trick. There's all sorts of code that parse_text
1073 * does, which we don't want to duplicate in parse_N and
1074 * parse_C. So feed a buffer back to parse_text. Tada! Start it
1075 * with "C" or "N" rather than "t" so clever stuff in parse_text
1076 * can tell the difference.
1077 */
1078 gchar *tmp;
1079 gboolean ret;
1080 guint len = strlen (text);
1081
1082 tmp = parser->buffer;
1083 parser->buffer = g_new (gchar, 2 + len);
1084 parser->buffer[0] = first_char;
1085 strncpy (parser->buffer + 1, text, len + 1);
1086
1087 ret = parse_text (parser, error);
1088
1089 g_free (parser->buffer);
1090 parser->buffer = tmp;
1091
1092 return ret;
1093 }
1094
1095 static void
cleanup_parsed_page(YelpManParser * parser)1096 cleanup_parsed_page (YelpManParser *parser)
1097 {
1098 /* First job: the last line usually has the version, date and
1099 * title (again!). The code above misunderstands and parses this
1100 * as a section, so we need to "undo" this and stick the data in
1101 * the header where it belongs.
1102 *
1103 * parser->section_node should still point to it. We assume this
1104 * has happened if it has exactly one child element (the <title>
1105 * tag)
1106 */
1107 gchar *lastline;
1108 GRegex *regex;
1109 gchar regex_string [1024];
1110
1111 if (xmlChildElementCount (parser->section_node) == 1) {
1112 lastline = (gchar *)xmlNodeGetContent (parser->section_node);
1113
1114 /* If parse_last_line works, it sets the data from it in the
1115 <header> tag, so delete the final section. */
1116 if (parse_last_line (parser, lastline)) {
1117 xmlUnlinkNode (parser->section_node);
1118 xmlFreeNode (parser->section_node);
1119 }
1120 else {
1121 /* Oh dear. This would be unexpected and doesn't seem to
1122 happen with man on my system. But we probably shouldn't
1123 ditch the info, so let's leave the <section> tag and
1124 print a warning message to the console.
1125 */
1126 g_warning ("Unexpected final line in man document (%s)\n",
1127 lastline);
1128 }
1129
1130 xmlFree (lastline);
1131 }
1132
1133 /* Next job: Go through and stick the links in. Text that looks
1134 * like man(1) should be converted to a link to man:man(1) and
1135 * urls should also be linkified.
1136 *
1137 * Unfortunately, it's not entirely clear what constitutes a valid
1138 * section. All sections must be alphanumeric and the logic we use
1139 * to avoid extra hits (eg "one or more widget(s)") is that either
1140 * the section must start with a digit or (if the current section
1141 * doesn't) must start with the same letter as the current
1142 * section.
1143 */
1144 snprintf (regex_string, 1024,
1145 "([a-zA-Z0-9\\-_.:]+)\\(((%c|[0-9])[a-zA-Z0-9]*)\\)",
1146 parser->section ? parser->section[0] : '0');
1147 regex = g_regex_new (regex_string, 0, 0, NULL);
1148 g_return_if_fail (regex);
1149 fixup_links (parser, regex, man_link_inserter);
1150 g_regex_unref (regex);
1151
1152 /* Now for http:// links.
1153 */
1154 regex = g_regex_new ("https?:\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+"
1155 "([\\w\\-\\.,@?^=%&:/~\\+#]*"
1156 "[\\w\\-\\@?^=%&/~\\+#])?",
1157 0, 0, NULL);
1158 g_return_if_fail (regex);
1159 fixup_links (parser, regex, http_link_inserter);
1160 g_regex_unref (regex);
1161 }
1162
1163 static gchar *
skip_whitespace(gchar * text)1164 skip_whitespace (gchar *text)
1165 {
1166 while (g_unichar_isspace (g_utf8_get_char (text))) {
1167 text = g_utf8_next_char (text);
1168 }
1169 return text;
1170 }
1171
1172 static gchar *
last_non_whitespace(gchar * text)1173 last_non_whitespace (gchar *text)
1174 {
1175 gchar *end = text + strlen(text);
1176 gchar *prev;
1177
1178 prev = g_utf8_find_prev_char (text, end);
1179 if (!prev) {
1180 /* The string must have been zero-length. */
1181 return NULL;
1182 }
1183
1184 while (g_unichar_isspace (g_utf8_get_char (prev))) {
1185 end = prev;
1186 prev = g_utf8_find_prev_char (text, prev);
1187 if (!prev) return NULL;
1188 }
1189 return end;
1190 }
1191
1192 static gchar *
find_contiguous_whitespace(gchar * text,guint ws_len)1193 find_contiguous_whitespace (gchar *text, guint ws_len)
1194 {
1195 guint counter = 0;
1196 gchar *ws_start = NULL;
1197 while (*text) {
1198 if (g_unichar_isspace (g_utf8_get_char (text))) {
1199 if (!counter) ws_start = text;
1200 counter++;
1201 }
1202 else counter = 0;
1203
1204 if (counter == ws_len) return ws_start;
1205
1206 text = g_utf8_next_char (text);
1207 }
1208 return NULL;
1209 }
1210
1211 static gboolean
parse_last_line(YelpManParser * parser,gchar * line)1212 parse_last_line (YelpManParser *parser, gchar* line)
1213 {
1214 /* We expect a line of the form
1215 '1.2.3 blah 2009 libfoo(1)'
1216 where the spaces are all nbsp's.
1217
1218 Look for a gap of at least 3 in a row. If we find that, expand
1219 either side and declare the stuff before to be the version
1220 number and then the stuff afterwards to be the start of the
1221 date. Then do the same thing on the next gap, if there is one.
1222 */
1223 gchar *gap, *date_start;
1224
1225 gchar *version;
1226 gchar *date;
1227
1228 gap = find_contiguous_whitespace (line, 3);
1229 if (!gap) return FALSE;
1230
1231 version = g_strndup (line, gap - line);
1232
1233 date_start = skip_whitespace (gap);
1234
1235 gap = find_contiguous_whitespace (date_start, 3);
1236 if (!gap) return FALSE;
1237
1238 date = g_strndup (date_start, gap - date_start);
1239
1240 xmlNewProp (parser->header, BAD_CAST "version", BAD_CAST version);
1241 xmlNewProp (parser->header, BAD_CAST "date", BAD_CAST date);
1242
1243 g_free (version);
1244 g_free (date);
1245
1246 return TRUE;
1247 }
1248
1249 /* This should work like g_strstrip, but that's an ASCII-only version
1250 * and I want to strip the nbsp's that I so thoughtfully plaster
1251 * stuff with...
1252 */
1253 static void
unicode_strstrip(gchar * str)1254 unicode_strstrip (gchar *str)
1255 {
1256 gchar *start, *end;
1257
1258 if (str == NULL) return;
1259
1260 end = last_non_whitespace (str);
1261
1262 if (!end) {
1263 /* String is zero-length or entirely whitespace */
1264 *str = '\0';
1265 return;
1266 }
1267 start = skip_whitespace (str);
1268
1269 memmove (str, start, end - start);
1270 *(str + (end - start)) = '\0';
1271 }
1272
1273 static void
sheet_fixup_links(xmlNodePtr sheet,const GRegex * regex,link_inserter inserter)1274 sheet_fixup_links (xmlNodePtr sheet,
1275 const GRegex *regex, link_inserter inserter)
1276 {
1277 /*
1278 This works as follows: grab (<span>) nodes from a sheet in
1279 order and stick their contents into a string. Since a sheet
1280 won't be ludicrously long, we can just grab everything and then
1281 work over it, but we need to keep track of which node points at
1282 which bit of the string so we can call inserter helpfully. To do
1283 so, use byte offsets, since that seems less likely to go
1284 horribly wrong!
1285 */
1286 GString *accumulator = g_string_new ("");
1287 xmlNodePtr span;
1288 xmlChar *tmp;
1289 gsize offset = 0;
1290 gsize len;
1291 offset_elt_pair pair;
1292 GMatchInfo *match_info;
1293
1294 /* Make pairs zero-terminated so that code can iterate through it
1295 * looking for something with elt = NULL. */
1296 GArray *pairs = g_array_new (TRUE, FALSE,
1297 sizeof (offset_elt_pair));
1298
1299 g_return_if_fail (regex);
1300 g_return_if_fail (inserter);
1301 g_return_if_fail (sheet);
1302
1303 for (span = sheet->children; span != NULL; span = span->next) {
1304 if (span->type != XML_ELEMENT_NODE) continue;
1305
1306 if (strcmp ((const char*) span->name, "span") != 0) {
1307
1308 if (strcmp ((const char*) span->name, "a") == 0)
1309 continue;
1310
1311 if (strcmp ((const char*) span->name, "br") == 0) {
1312 /* If the last character in the accumulator is a
1313 * hyphen, we don't want to include that in the link
1314 * we make. If not, append a newline to the
1315 * accumulator (so we don't mistakenly make links from
1316 * "see\nthis(2)" to seethis(2).
1317 *
1318 * Either way, we add the <br> to the list of pairs
1319 * since we might need to do stuff with it if it's in
1320 * the middle of a link.
1321 */
1322 len = strlen (accumulator->str);
1323 if (len > 0 && accumulator->str [len-1] == '-') {
1324 g_string_truncate (accumulator, len - 1);
1325 offset--;
1326 }
1327 else {
1328 g_string_append_c (accumulator, '\n');
1329 offset++;
1330 }
1331 pair.start = offset;
1332 pair.end = offset;
1333 pair.elt = span; /* Er, br in fact. */
1334 g_array_append_val (pairs, pair);
1335
1336 continue;
1337 }
1338
1339 g_warning ("Expected all child elements to be "
1340 "<span>, <br> or <a>, but "
1341 "have found a <%s>.",
1342 (gchar *) span->name);
1343 continue;
1344 }
1345
1346 tmp = xmlNodeGetContent (span);
1347 g_string_append (accumulator, (gchar *) tmp);
1348 len = strlen ((const char*) tmp);
1349
1350 pair.start = offset;
1351 pair.end = offset + len;
1352 pair.elt = span;
1353
1354 g_array_append_val (pairs, pair);
1355
1356 offset += len;
1357 xmlFree (tmp);
1358 }
1359
1360 /* We've got the data. Now try to match the regex against it as
1361 * many times as possible
1362 */
1363 offset = 0;
1364 g_regex_match_full (regex, accumulator->str,
1365 -1, offset, 0, &match_info, NULL);
1366 while (g_match_info_matches (match_info)) {
1367 offset = inserter ((offset_elt_pair *)pairs->data,
1368 match_info);
1369
1370 g_match_info_free (match_info);
1371
1372 g_regex_match_full (regex, accumulator->str,
1373 -1, offset, 0, &match_info, NULL);
1374 }
1375
1376 g_string_free (accumulator, TRUE);
1377 g_array_unref (pairs);
1378 }
1379
1380 static void
fixup_links(YelpManParser * parser,const GRegex * regex,link_inserter inserter)1381 fixup_links (YelpManParser *parser,
1382 const GRegex *regex, link_inserter inserter)
1383 {
1384 /* Iterate over all the <sheet>'s in the xml document */
1385 xmlXPathContextPtr context;
1386 xmlXPathObjectPtr path_obj;
1387 xmlNodeSetPtr nodeset;
1388 gint i;
1389
1390 context = xmlXPathNewContext (parser->doc);
1391 g_return_if_fail (context);
1392
1393 path_obj = xmlXPathEvalExpression (BAD_CAST "//sheet", context);
1394 g_return_if_fail (path_obj);
1395
1396 nodeset = path_obj->nodesetval;
1397 g_return_if_fail (nodeset);
1398
1399 for (i = 0; i < nodeset->nodeNr; ++i) {
1400 sheet_fixup_links (nodeset->nodeTab[i], regex, inserter);
1401 }
1402
1403 xmlXPathFreeObject (path_obj);
1404 xmlXPathFreeContext (context);
1405 }
1406
1407 /*
1408 This inserts new_child under parent. If older_sibling is non-NULL,
1409 we stick it immediately after it. Otherwise, insert as the first
1410 child of the parent.
1411
1412 Returns the inserted child.
1413 */
1414 static xmlNodePtr
insert_child_after(xmlNodePtr parent,xmlNodePtr older_sibling,xmlNodePtr new_child)1415 insert_child_after (xmlNodePtr parent, xmlNodePtr older_sibling,
1416 xmlNodePtr new_child)
1417 {
1418 g_return_val_if_fail (parent && new_child, new_child);
1419
1420 if (older_sibling) {
1421 xmlAddNextSibling (older_sibling, new_child);
1422 }
1423 else if (parent->children == NULL) {
1424 xmlAddChild (parent, new_child);
1425 }
1426 else {
1427 xmlAddPrevSibling (parent->children, new_child);
1428 }
1429
1430 return new_child;
1431 }
1432
1433 static void
copy_prop(xmlNodePtr to,xmlNodePtr from,const xmlChar * name)1434 copy_prop (xmlNodePtr to, xmlNodePtr from, const xmlChar *name)
1435 {
1436 xmlChar *prop = xmlGetProp (from, name);
1437 g_return_if_fail (prop);
1438 xmlSetProp (to, name, prop);
1439 xmlFree (prop);
1440 }
1441
1442 static gsize
do_node_replacement(xmlNodePtr anchor_node,offset_elt_pair * offsets,gsize startpos,gsize endpos)1443 do_node_replacement (xmlNodePtr anchor_node,
1444 offset_elt_pair *offsets,
1445 gsize startpos, gsize endpos)
1446 {
1447 xmlNodePtr node, sibling_before;
1448 gchar *gtmp;
1449 xmlChar *xtmp, *xshort;
1450 gsize look_from;
1451
1452 /* Find the first element by searching through offsets. I suppose
1453 * a binary search would be cleverer, but I doubt that this will
1454 * take significant amounts of time.
1455 *
1456 * We should never fall off the end, but (just in case) the GArray
1457 * that holds the offsets is zero-terminated and elt should never
1458 * be NULL so we can stop if necessary
1459 */
1460 while ((offsets->end <= startpos) && offsets->elt) {
1461 offsets++;
1462 }
1463 g_return_val_if_fail (offsets->elt, endpos);
1464
1465 /* xtmp is NULL by default, but we do this here so that if we read
1466 * the node in the if block below, we don't have to do it a second
1467 * time.
1468 */
1469 xtmp = NULL;
1470 sibling_before = offsets->elt->prev;
1471 look_from = startpos;
1472
1473 /* Maybe there's text in the relevant span before the start of
1474 * the stuff we want to replace with a link.
1475 */
1476 if (startpos > offsets->start) {
1477 node = xmlNewNode (NULL, BAD_CAST "span");
1478 copy_prop (node, offsets->elt, BAD_CAST "class");
1479
1480 xtmp = xmlNodeGetContent (offsets->elt);
1481 gtmp = g_strndup ((const gchar*)xtmp, startpos - offsets->start);
1482 xmlNodeAddContent (node, BAD_CAST gtmp);
1483 g_free (gtmp);
1484
1485 sibling_before = insert_child_after (offsets->elt->parent,
1486 sibling_before, node);
1487 }
1488
1489 insert_child_after (offsets->elt->parent,
1490 sibling_before, anchor_node);
1491
1492 /* The main loop. Here we work over each span that overlaps with
1493 * the link we're adding. We add a similar span as a child of the
1494 * anchor node and then delete the existing one. */
1495 while (look_from < endpos) {
1496 if (!xtmp) xtmp = xmlNodeGetContent (offsets->elt);
1497
1498 if (strcmp ((const gchar*)offsets->elt->name, "br") == 0) {
1499 node = xmlNewChild (anchor_node,
1500 NULL, BAD_CAST "br", NULL);
1501 xmlUnlinkNode (offsets->elt);
1502 xmlFreeNode (offsets->elt);
1503 xmlFree (xtmp);
1504 xtmp = NULL;
1505 offsets++;
1506 }
1507 else if (endpos < offsets->end) {
1508 xshort = BAD_CAST g_strndup ((const gchar*)xtmp,
1509 endpos - offsets->start);
1510
1511 node = xmlNewChild (anchor_node, NULL, BAD_CAST "span",
1512 xshort + (look_from-offsets->start));
1513 copy_prop (node, offsets->elt, BAD_CAST "class");
1514
1515 node = xmlNewNode (NULL, BAD_CAST "span");
1516 xmlNodeAddContent (node,
1517 xtmp + (endpos - offsets->start));
1518 copy_prop (node, offsets->elt, BAD_CAST "class");
1519 xmlAddNextSibling (anchor_node, node);
1520
1521 xmlFree (xshort);
1522
1523 xmlUnlinkNode (offsets->elt);
1524 xmlFreeNode (offsets->elt);
1525 xmlFree (xtmp);
1526 xtmp = NULL;
1527
1528 offsets->start = endpos;
1529 offsets->elt = node;
1530 }
1531 else {
1532 node = xmlNewChild (anchor_node, NULL, BAD_CAST "span",
1533 xtmp + (look_from - offsets->start));
1534 copy_prop (node, offsets->elt, BAD_CAST "class");
1535
1536 xmlUnlinkNode (offsets->elt);
1537 xmlFreeNode (offsets->elt);
1538 xmlFree (xtmp);
1539 xtmp = NULL;
1540 offsets++;
1541 }
1542
1543 if (!offsets->elt) {
1544 /* We got to the end of a sheet and of the stuff we're
1545 * doing at the same time
1546 */
1547 return endpos;
1548 }
1549
1550 look_from = offsets->start;
1551 }
1552
1553 return offsets->start;
1554 }
1555
1556 static gsize
do_link_insertion(const gchar * url,offset_elt_pair * offsets,gsize startpos,gsize endpos)1557 do_link_insertion (const gchar *url,
1558 offset_elt_pair *offsets,
1559 gsize startpos, gsize endpos)
1560 {
1561 xmlNodePtr anchor_node = xmlNewNode (NULL, BAD_CAST "a");
1562
1563 xmlNewProp (anchor_node, BAD_CAST "href", BAD_CAST url);
1564
1565 return do_node_replacement (anchor_node, offsets,
1566 startpos, endpos);
1567 }
1568
1569 static gsize
man_link_inserter(offset_elt_pair * offsets,const GMatchInfo * match_info)1570 man_link_inserter (offset_elt_pair *offsets,
1571 const GMatchInfo *match_info)
1572 {
1573 gchar *name, *section;
1574 gchar url[1024];
1575
1576 gint startpos, endpos;
1577
1578 g_match_info_fetch_pos (match_info, 0, &startpos, &endpos);
1579
1580 name = g_match_info_fetch (match_info, 1);
1581 section = g_match_info_fetch (match_info, 2);
1582
1583 g_return_val_if_fail (name && section, endpos);
1584
1585 snprintf (url, 1024, "man:%s(%s)", name, section);
1586
1587 g_free (name);
1588 g_free (section);
1589
1590 return do_link_insertion (url, offsets, startpos, endpos);
1591 }
1592
1593 static gsize
http_link_inserter(offset_elt_pair * offsets,const GMatchInfo * match_info)1594 http_link_inserter (offset_elt_pair *offsets,
1595 const GMatchInfo *match_info)
1596 {
1597 gchar *url;
1598 gint startpos, endpos;
1599 gsize ret;
1600
1601 url = g_match_info_fetch (match_info, 0);
1602 g_match_info_fetch_pos (match_info, 0, &startpos, &endpos);
1603
1604 ret = do_link_insertion (url, offsets, startpos, endpos);
1605
1606 g_free (url);
1607
1608 return ret;
1609 }
1610