1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 4 -*- */
2 /*
3  * Copyright (C) 2003-2010 Shaun McCance <shaunm@gnome.org>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation; either version 2 of the
8  * License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public
16  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
17  *
18  * Author: Shaun McCance <shaunm@gnome.org>
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24 
25 #include <glib.h>
26 #include <glib/gi18n.h>
27 #include <libxml/tree.h>
28 #include <libxml/xpath.h>
29 #include <gio/gio.h>
30 #include <gio/gunixinputstream.h>
31 #include <string.h>
32 #include <math.h>
33 
34 #include "yelp-error.h"
35 #include "yelp-man-parser.h"
36 
37 #define MAN_FONTS 8
38 
39 /* The format has two copies of the title like MAN(1) at the top,
40  * possibly with a string of text in between for the collection.
41  *
42  * Start with the parser on START, then HAVE_TITLE when we've read the
43  * first word with parentheses. At that point, stick new words into
44  * the "collection" tag. Then finally switch to BODY when we've seen
45  * the second copy of the one with parentheses.
46  */
typedef enum ManParserState
{
    START,       /* Nothing recognised yet: still accumulating the first title */
    HAVE_TITLE,  /* First 'Name(section)' seen: collecting the collection text */
    BODY         /* Header finished: text now goes to sections and sheets */
} ManParserState;
53 
/* Whether body text currently belongs to a section's title or to its
 * contents. See parse_body_text for how this is used.
 */
typedef enum ManParserSectionState
{
    SECTION_TITLE,  /* Accumulating a section title; parse_n ends it */
    SECTION_BODY    /* Inside the contents of a section */
} ManParserSectionState;
60 
/* All the state shared by the per-line handlers while one page is
 * being converted: the XML document under construction, the input
 * being read and the layout bookkeeping.
 */
struct _YelpManParser {
    xmlDocPtr     doc;           /* The top-level XML document */
    xmlNodePtr    header;        /* The header node (child of the root) */
    xmlNodePtr    section_node;  /* The current section, or NULL if none yet */
    xmlNodePtr    sheet_node;    /* The current sheet, or NULL if none yet */

    GDataInputStream *stream;    /* The GIO input stream to read from */
    gchar            *buffer;    /* The line currently being parsed */
    gsize             length;    /* Length of that line, as reported by the read */

    gchar            *section;   /* Manual section parsed from the title,
                                  * e.g. the "1" of "Foo(1)" (see parse_text) */

    /* The width and height of a character according to troff. Read
     * from the second line of the input (see parse_prologue_line).
     */
    guint char_width;
    guint char_height;

    /* Count the number of lines we've parsed (needed to get prologue) */
    guint line_no;

    /* The x f k name command sets the k'th register to be name.
     * Each entry is an owned copy, or NULL if never set.
     */
    gchar* font_registers[MAN_FONTS];

    /* The current font, as set by the most recent f command. Should
     * be the index of one of the font_registers; get_font copes if
     * it isn't. Starts at 0.
     */
    guint current_font;

    /* See description of ManParserState above */
    ManParserState state;

    /* Vertical and horizontal position as far as the troff output is
     * concerned. (Measured from top-left).
     */
    guint vpos, hpos;

    /* Text accumulator (needed since it comes through in dribs &
     * drabs...) */
    GString *accumulator;

    /* See parse_body_text for how this is used. */
    ManParserSectionState section_state;

    /* The indent of the current sheet */
    guint sheet_indent;

    /* Set to TRUE if there's been a newline since the last text was
     * parsed. */
    gboolean newline;

    /* Count the number of 'N' lines we've seen since the last h
     * command. This is because for some reason N doesn't
     * automatically move the position forward. Thus immediately after
     * one, you see a h24 or the like. Unless there's a space. Then it
     * might be wh48. This is set in parse_N and parse_C, reset by
     * 't' lines in parse_text, and used (then reset) in parse_h.
     */
    guint N_count;

    /* Keep track of whether the last character was a space. We can't
     * just do this by looking at the last char of accumulator,
     * because if there's a font change, it gets zeroed. This gets set
     * to TRUE by parse_w and is FALSE the rest of the time.
     */
    gboolean last_char_was_space;

    /* Keep track of the size of the last vertical jump - used to tell
     * whether we need to insert extra space above a line.
     */
    gint last_vertical_jump;

    /* The title we read earlier (eg 'Foo(2)'). Owned copy. */
    gchar *title_str;
};
134 
135 static gboolean parser_parse_line (YelpManParser *parser, GError **error);
136 static gboolean parse_prologue_line (YelpManParser *parser, GError **error);
137 
138 /* Parsers for different types of line */
/* A handler for one kind of line. It reads the line from
 * parser->buffer and returns TRUE on success; the handlers in this
 * file report failure by setting *error and returning FALSE.
 */
typedef gboolean (*LineParser)(YelpManParser *, GError **);

/* Forward-declare a handler with the LineParser signature. The
 * trailing semicolon is part of the macro, so uses don't add one.
 */
#define DECLARE_LINE_PARSER(name) \
    static gboolean (name) (YelpManParser *parser, GError **error);

DECLARE_LINE_PARSER (parse_xf)
DECLARE_LINE_PARSER (parse_f)
DECLARE_LINE_PARSER (parse_V)
DECLARE_LINE_PARSER (parse_H)
DECLARE_LINE_PARSER (parse_v)
DECLARE_LINE_PARSER (parse_h)
DECLARE_LINE_PARSER (parse_text)
DECLARE_LINE_PARSER (parse_w)
DECLARE_LINE_PARSER (parse_body_text)
DECLARE_LINE_PARSER (parse_n)
DECLARE_LINE_PARSER (parse_N)
DECLARE_LINE_PARSER (parse_C)
DECLARE_LINE_PARSER (parse_p)
156 
/* Declare a sort of alist registry of parsers for different lines.
 *
 * parser_parse_line walks this table in order and calls the handler
 * of the first entry whose prefix starts the current line; lines that
 * match no entry are silently accepted. The entry with a NULL handler
 * terminates the table.
 */
struct LineParsePair
{
    const gchar *prefix;   /* What the line must start with */
    LineParser handler;    /* Who deals with such a line */
};
static struct LineParsePair line_parsers[] = {
    { "x f", parse_xf }, { "f", parse_f },
    { "V", parse_V }, { "H", parse_H },
    { "v", parse_v }, { "h", parse_h },
    { "t", parse_text },
    { "w", parse_w },
    { "n", parse_n },
    { "N", parse_N },
    { "C", parse_C },
    { "p", parse_p },
    { NULL, NULL }
};
175 
176 /******************************************************************************/
177 /* Parser helper functions (managing the state of the various parsing
178  * bits) */
179 static void finish_span (YelpManParser *parser);
180 static guint dx_to_em_count (YelpManParser *parser, guint dx);
181 static void append_nbsps (YelpManParser *parser, guint k);
182 static void deal_with_newlines (YelpManParser *parser);
183 static void new_sheet (YelpManParser *parser);
184 static void register_title (YelpManParser *parser,
185                             const gchar* name, const gchar* section);
186 static void right_truncate_common (gchar *dst, const gchar *src);
187 static gboolean cheeky_call_parse_line (YelpManParser *parser,
188                                         GError **error,
189                                         gchar first_char,
190                                         const gchar *text);
191 static void cleanup_parsed_page (YelpManParser *parser);
192 static gboolean parse_last_line (YelpManParser *parser, gchar* line);
193 static void unicode_strstrip (gchar *str);
194 
195 /*
196   A link_inserter takes
197     (1) an array of offsets for the different spans within the string
198     (2) the match info from the regex match
199 
200   It's then responsible for mangling the XML tree to insert the actual
201   link. Finally, it should return the offset into the string of the
202   end of what it's just dealt with. If necessary, it should also fix
203   up offsets to point correctly at the last node inserted.
204  */
/* One span of text paired with the XML element it belongs to.
 * NOTE(review): start/end are presumably offsets of the span within
 * the string being matched - confirm against fixup_links.
 */
typedef struct {
    gsize      start, end;
    xmlNodePtr elt;
} offset_elt_pair;

/* Signature of a link inserter: takes the span offsets and the regex
 * match info, and returns a gsize offset (see the description above).
 */
typedef gsize (*link_inserter)(offset_elt_pair *,
                               const GMatchInfo *);
212 
213 static void fixup_links (YelpManParser *parser,
214                          const GRegex *matcher,
215                          link_inserter inserter);
216 
217 static gsize man_link_inserter (offset_elt_pair *offsets,
218                                 const GMatchInfo *match_info);
219 static gsize http_link_inserter (offset_elt_pair *offsets,
220                                  const GMatchInfo *match_info);
221 
222 /******************************************************************************/
223 /* Translations for the 'C' command. This is indeed hackish, but the
224  * -Tutf8 output doesn't seem to give include files so we can do this
225  * at runtime :-(
226  *
227  * On my machine, this data's at /usr/share/groff/current/tmac/ in
228  * latin1.tmac, unicode.tmac and I worked out the lq and rq from
229  * running man: I'm not sure where that comes from!
230  */
/* Maps the name given on a 'C' line to the Unicode code point it
 * stands for (see parse_C).
 */
struct StringPair
{
    const gchar *from;   /* Special character name as it follows the 'C' */
    gunichar to;         /* Code point to emit in its place */
};

/* Terminated by an entry whose 'from' is NULL. */
static const struct StringPair char_translations[] = {
    { "r!", 161 },
    { "ct", 162 },
    { "Po", 163 },
    { "Cs", 164 },
    { "Ye", 165 },
    { "bb", 166 },
    { "sc", 167 },
    { "ad", 168 },
    { "co", 169 },
    { "Of", 170 },
    { "Fo", 171 },
    { "tno", 172 },
    { "%", 173 },
    { "rg", 174 },
    { "a-", 175 },
    { "de", 176 },
    { "t+-", 177 },
    { "S2", 178 },
    { "S3", 179 },
    { "aa", 180 },
    { "mc", 181 },
    { "ps", 182 },
    { "pc", 183 },
    { "ac", 184 },
    { "S1", 185 },
    { "Om", 186 },
    { "Fc", 187 },
    { "14", 188 },
    { "12", 189 },
    { "34", 190 },
    { "r?", 191 },
    { "`A", 192 },
    { "'A", 193 },
    { "^A", 194 },
    { "~A", 195 },
    { ":A", 196 },
    { "oA", 197 },
    { "AE", 198 },
    { ",C", 199 },
    { "`E", 200 },
    { "'E", 201 },
    { "^E", 202 },
    { ":E", 203 },
    { "`I", 204 },
    { "'I", 205 },
    { "^I", 206 },
    { ":I", 207 },
    { "-D", 208 },
    { "~N", 209 },
    { "`O", 210 },
    { "'O", 211 },
    { "^O", 212 },
    { "~O", 213 },
    { ":O", 214 },
    { "tmu", 215 },
    { "/O", 216 },
    { "`U", 217 },
    { "'U", 218 },
    { "^U", 219 },
    { ":U", 220 },
    { "'Y", 221 },
    { "TP", 222 },
    { "ss", 223 },
    { "`a", 224 },
    { "'a", 225 },
    { "^a", 226 },
    { "~a", 227 },
    { ":a", 228 },
    { "oa", 229 },
    { "ae", 230 },
    { ",c", 231 },
    { "`e", 232 },
    { "'e", 233 },
    { "^e", 234 },
    { ":e", 235 },
    { "`i", 236 },
    { "'i", 237 },
    { "^i", 238 },
    { ":i", 239 },
    { "Sd", 240 },
    { "~n", 241 },
    { "`o", 242 },
    { "'o", 243 },
    { "^o", 244 },
    { "~o", 245 },
    { ":o", 246 },
    { "tdi", 247 },
    { "/o", 248 },
    { "`u", 249 },
    { "'u", 250 },
    { "^u", 251 },
    { ":u", 252 },
    { "'y", 253 },
    { "Tp", 254 },
    { ":y", 255 },
    { "hy", '-' },
    { "oq", '`' },
    { "cq", '\'' },
    { "lq", 8220 }, /* left smart quotes */
    { "rq", 8221 }, /* right smart quotes */
    { "en", 8211 }, /* en-dash */
    { "em", 8212 }, /* em-dash */
    { "la", 10216 }, /* left angle bracket */
    { "ra", 10217 }, /* right angle bracket */
    { "rs", '\\' },
    { "<=", 8804 }, /* less than or equal to sign */
    { ">=", 8805 }, /* greater than or equal to sign */
    { "aq", '\'' },
    { "tm", 8482 }, /* trademark symbol */
    { NULL, 0 }
};
348 
349 /******************************************************************************/
350 
351 YelpManParser *
yelp_man_parser_new(void)352 yelp_man_parser_new (void)
353 {
354     YelpManParser *parser = g_new0 (YelpManParser, 1);
355     parser->accumulator = g_string_sized_new (1024);
356     return parser;
357 }
358 
359 /*
360   This function is responsible for taking a path to a man file and
361   returning something in the groff intermediate output format for us
362   to use.
363 
364   If something goes wrong, we return NULL and set error to be a
365   YelpError describing the problem.
366 */
367 static GInputStream*
get_troff(gchar * path,GError ** error)368 get_troff (gchar *path, GError **error)
369 {
370     gint ystdout;
371     GError *err = NULL;
372     const gchar *argv[] = { "/usr/local/libexec", "/yelp-groff", path, NULL };
373     gchar **my_argv;
374 
375     /* g_strdupv() should accept a "const gchar **". */
376     my_argv = g_strdupv ((gchar **) argv);
377 
378     if (!g_spawn_async_with_pipes (NULL, my_argv, NULL,
379                                    G_SPAWN_SEARCH_PATH, NULL, NULL,
380                                    NULL, NULL, &ystdout, NULL, &err)) {
381         /* We failed to run the man program. Return a "Huh?" error. */
382         *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN,
383                               "%s", err->message);
384         g_error_free (err);
385         g_strfreev (my_argv);
386         return NULL;
387     }
388 
389     g_strfreev (my_argv);
390 
391     return (GInputStream*) g_unix_input_stream_new (ystdout, TRUE);
392 }
393 
394 xmlDocPtr
yelp_man_parser_parse_file(YelpManParser * parser,gchar * path,GError ** error)395 yelp_man_parser_parse_file (YelpManParser *parser,
396                             gchar *path,
397                             GError **error)
398 {
399     GInputStream *troff_stream;
400     gboolean ret;
401     xmlNodePtr root;
402 
403     troff_stream = get_troff (path, error);
404     if (!troff_stream) return NULL;
405 
406     parser->stream = g_data_input_stream_new (troff_stream);
407 
408     parser->doc = xmlNewDoc (BAD_CAST "1.0");
409     root = xmlNewNode (NULL, BAD_CAST "Man");
410     xmlDocSetRootElement (parser->doc, root);
411 
412     parser->header = xmlNewNode (NULL, BAD_CAST "header");
413     xmlAddChild (root, parser->header);
414 
415     while (1) {
416        parser->buffer =
417        g_data_input_stream_read_line (parser->stream,
418                                       &(parser->length),
419                                       NULL, NULL);
420        if (parser->buffer == NULL) break;
421 
422        parser->line_no++;
423        ret = parser_parse_line (parser, error);
424 
425        g_free (parser->buffer);
426 
427        if (!ret) {
428            xmlFreeDoc (parser->doc);
429            parser->doc = NULL;
430            break;
431        }
432     }
433 
434     cleanup_parsed_page (parser);
435 
436     g_object_unref (parser->stream);
437 
438     return parser->doc;
439 }
440 
441 void
yelp_man_parser_free(YelpManParser * parser)442 yelp_man_parser_free (YelpManParser *parser)
443 {
444     guint k;
445 
446     if (parser == NULL)
447         return;
448 
449     for (k=0; k<MAN_FONTS; k++)
450         g_free (parser->font_registers[k]);
451     g_string_free (parser->accumulator, TRUE);
452     g_free (parser->title_str);
453     g_free (parser->section);
454     g_free (parser);
455 }
456 
457 /******************************************************************************/
458 
459 /* Sets the k'th font register to be name. Copies name, so free it
460  * afterwards. k should be in [0,MAN_FONTS). It seems that man always
461  * gives us ones at least 1, but groff_out(5) says non-negative.
462  */
463 static void
set_font_register(YelpManParser * parser,guint k,const gchar * name)464 set_font_register (YelpManParser *parser, guint k, const gchar* name)
465 {
466     if (k >= MAN_FONTS) {
467         g_warning ("Tried to set nonexistant font register %u to %s",
468                    k, name);
469         return;
470     }
471     g_free (parser->font_registers[k]);
472     parser->font_registers[k] = g_strdup (name);
473 }
474 
475 static const gchar*
get_font(const YelpManParser * parser)476 get_font (const YelpManParser *parser)
477 {
478     guint k = parser->current_font;
479     if (k >= MAN_FONTS ||
480         parser->font_registers[k] == NULL) {
481 
482         g_warning ("Tried to get nonexistant font register %u", k);
483 
484         return "";
485     }
486 
487     return parser->font_registers[k];
488 }
489 
490 /******************************************************************************/
491 
/*
  Convenience macros to scan a string, checking for the correct number
  of things read. All of them expect a variable called parser to be
  in scope; RAISE_PARSE_ERROR also expects one called error and must
  be used in a function returning gboolean.

  SSCANF scans parser->buffer and is TRUE when the scan FAILED, that
  is, when the number of items converted isn't num.

  Also to raise an error. Add an %s to the end of the format string,
  which automatically gets given parser->buffer.

  RAISE_PARSE_ERROR goes through g_propagate_error, so it is safe when
  the caller passed a NULL error (the new error is simply dropped).
 */
#define SSCANF(fmt,num,...)                                 \
    (sscanf (parser->buffer, (fmt), __VA_ARGS__) != (num))

#define PARSE_ERROR(...)                                    \
    g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING,         \
                 __VA_ARGS__, parser->buffer)
#define RAISE_PARSE_ERROR(...)                              \
    { g_propagate_error (error, PARSE_ERROR (__VA_ARGS__)); return FALSE; }
507 
508 static gboolean
parser_parse_line(YelpManParser * parser,GError ** error)509 parser_parse_line (YelpManParser *parser, GError **error)
510 {
511     const struct LineParsePair *p;
512 
513     if (parser->line_no <= 3)
514         return parse_prologue_line (parser, error);
515 
516     p = line_parsers;
517     while (p->handler != NULL) {
518         if (g_str_has_prefix (parser->buffer, p->prefix)) {
519             return p->handler(parser, error);
520         }
521         p++;
522     }
523     return TRUE;
524 }
525 
526 static gboolean
parse_prologue_line(YelpManParser * parser,GError ** error)527 parse_prologue_line (YelpManParser *parser, GError **error)
528 {
529     if (parser->line_no != 2) return TRUE;
530 
531     /* This is the interesting line, which should look like
532               x res 240 24 40
533        The interesting bits are the 24 and the 40, which are the
534        width and height of a character as far as -Tutf8 is
535        concerned.
536     */
537     if (SSCANF ("x %*s %*u %u %u", 2,
538                 &parser->char_width, &parser->char_height)) {
539         RAISE_PARSE_ERROR ("Wrong 'x res' line from troff: %s");
540     }
541 
542     return TRUE;
543 }
544 
545 static gboolean
parse_xf(YelpManParser * parser,GError ** error)546 parse_xf (YelpManParser *parser, GError **error)
547 {
548     gchar name[11];
549     guint k;
550 
551     if (SSCANF ("x f%*s %u %10s", 2, &k, name)) {
552         RAISE_PARSE_ERROR ("Invalid 'x f' line from troff: %s");
553     }
554     set_font_register (parser, k, name);
555     return TRUE;
556 }
557 
558 static gboolean
parse_f(YelpManParser * parser,GError ** error)559 parse_f (YelpManParser *parser, GError **error)
560 {
561     guint k;
562     if (SSCANF ("f%u", 1, &k)) {
563         RAISE_PARSE_ERROR ("Invalid font line from troff: %s");
564     }
565     finish_span (parser);
566 
567     parser->current_font = k;
568 
569     return TRUE;
570 }
571 
572 static gboolean
parse_v(YelpManParser * parser,GError ** error)573 parse_v (YelpManParser *parser, GError **error)
574 {
575     guint dy;
576     if (SSCANF ("v%u", 1, &dy)) {
577         RAISE_PARSE_ERROR ("Invalid v line from troff: %s");
578     }
579     parser->last_vertical_jump += dy;
580     parser->vpos += dy;
581     return TRUE;
582 }
583 
584 static gboolean
parse_h(YelpManParser * parser,GError ** error)585 parse_h (YelpManParser *parser, GError **error)
586 {
587     guint dx;
588     int k;
589 
590     if (SSCANF ("h%u", 1, &dx)) {
591         RAISE_PARSE_ERROR ("Invalid h line from troff: %s");
592     }
593     parser->hpos += dx;
594 
595     /* This is a bit hackish to be honest but... if we're in something
596      * that'll end up in a span, a spacing h command means that a gap
597      * should appear. It seems that the easiest way to get this is to
598      * insert nonbreaking spaces (eugh!)
599      *
600      * Of course we don't want to do this when chained from wh24 or
601      * whatever, so use the last_char_was_space flag
602      * but... unfortunately some documents actually use stuff like
603      * wh96 for spacing (eg the lists in perl(1)). So (very hackish!),
604      * ignore double spaces, since that's probably just been put in to
605      * make the text justified (eugh), but allow bigger jumps.
606      *
607      * Incidentally, the perl manual here has bizarre gaps in the
608      * synopsis section. God knows why, but man displays them too so
609      * it's not our fault! :-)
610      */
611     k = dx_to_em_count (parser, dx);
612 
613     if ((!parser->last_char_was_space) || (k > 2)) {
614 
615         k -= parser->N_count;
616         if (k < 0) k = 0;
617 
618         append_nbsps (parser, k);
619     }
620 
621     parser->N_count = 0;
622 
623     return TRUE;
624 }
625 
626 static gboolean
parse_V(YelpManParser * parser,GError ** error)627 parse_V (YelpManParser *parser, GError **error)
628 {
629     guint y;
630     if (SSCANF ("V%u", 1, &y)) {
631         RAISE_PARSE_ERROR ("Invalid V line from troff: %s");
632     }
633     parser->last_vertical_jump += y - parser->vpos;
634     parser->vpos = y;
635     return TRUE;
636 }
637 
638 static gboolean
parse_H(YelpManParser * parser,GError ** error)639 parse_H (YelpManParser *parser, GError **error)
640 {
641     guint x;
642     if (SSCANF ("H%u", 1, &x)) {
643         RAISE_PARSE_ERROR ("Invalid H line from troff: %s");
644     }
645     parser->hpos = x;
646     return TRUE;
647 }
648 
649 static gboolean
parse_text(YelpManParser * parser,GError ** error)650 parse_text (YelpManParser *parser, GError **error)
651 {
652     gchar *text, *section, *tmp;
653     const gchar *acc;
654 
655     /*
656       Sneakily, this might get called with something other than t
657       starting the buffer: see parse_C and parse_N.
658     */
659     if (parser->buffer[0] == 't') {
660         parser->N_count = 0;
661     }
662 
663     if (parser->state == START) {
664         /* This should be the 'Title String(1)' line. It might come in
665          * chunks (for example, it might be more than one line
666          * long!). So just read bits until we get a (blah) bit: stick
667          * everything in the accumulator and check for
668          * parentheses. When we've got some, stick the parsed title in
669          * the header and switch to HAVE_TITLE.
670          *
671          * The parse_n code will error out if we didn't manage to get
672          * a title before the first newline and otherwise is in charge
673          * of switching to body-parsing mode.
674          */
675         g_string_append (parser->accumulator, parser->buffer+1);
676 
677         acc = parser->accumulator->str;
678 
679         section = strchr (acc, '(');
680 
681         if (section) {
682             section++;
683             tmp = strchr (section, ')');
684         }
685 
686         if (section && tmp) {
687             /* We've got 'Blah (3)' or the like in the accumulator */
688             if (*(tmp+1) != '\0') {
689                 RAISE_PARSE_ERROR ("Don't understand title line: '%s'");
690             }
691             parser->state = HAVE_TITLE;
692             parser->title_str = g_strdup (acc);
693 
694             text = g_strndup (acc, (section - 1) - acc);
695             section = g_strndup (section, tmp - section);
696 
697             register_title (parser, text, section);
698 
699             g_string_truncate (parser->accumulator, 0);
700 
701             g_free (text);
702             parser->section = section;
703         }
704 
705         return TRUE;
706     }
707 
708     if (parser->state == BODY)
709         return parse_body_text (parser, error);
710 
711     /* In state HAVE_TITLE */
712     else {
713         /* We expect (maybe!) to get some lines in between the two
714          * occurrences of the title itself. So collect up all the text
715          * we get and then we'll remove the copy of the title at the
716          * end (hopefully) when we find a newline in parse_n.
717          */
718         g_string_append (parser->accumulator, parser->buffer+1);
719         return TRUE;
720     }
721 }
722 
723 static gboolean
parse_body_text(YelpManParser * parser,GError ** error)724 parse_body_text (YelpManParser *parser, GError **error)
725 {
726     /*
727       It's this function which is responsible for trying to get *some*
728       semantic information back out of the manual page.
729 
730       The highest-level chopping up is into sections. We use the
731       heuristic that if either
732         (1) We haven't got a section yet or
733         (2) text starts a line (hpos=0)
734       then it's a section title.
735 
736       It's possible to have spaces in section titles, so we carry on
737       accumulating the section title until the next newline.
738     */
739     if (parser->section_state == SECTION_BODY &&
740         (!parser->section_node || (parser->hpos == 0))) {
741         g_string_truncate (parser->accumulator, 0);
742         /* End the current sheet & section */
743         parser->section_state = SECTION_TITLE;
744         parser->sheet_node = NULL;
745 
746         parser->section_node =
747             xmlAddChild (xmlDocGetRootElement (parser->doc),
748                          xmlNewNode (NULL, BAD_CAST "section"));
749     }
750 
751     if (parser->section_state != SECTION_TITLE) {
752         deal_with_newlines (parser);
753     }
754 
755     g_string_append (parser->accumulator, parser->buffer+1);
756 
757     /* Move hpos forward per char */
758     parser->hpos += strlen (parser->buffer+1) * parser->char_width;
759 
760     parser->last_char_was_space = FALSE;
761 
762     return TRUE;
763 }
764 
765 /*
766   w is a sort of prefix argument. It indicates a space, so we register
767   that here, then call parser_parse_line again on the rest of the
768   string to deal with that.
769  */
770 static gboolean
parse_w(YelpManParser * parser,GError ** error)771 parse_w (YelpManParser *parser, GError **error)
772 {
773     gboolean ret;
774 
775     if (parser->state != START) {
776         g_string_append_c (parser->accumulator, ' ');
777     }
778 
779     parser->buffer++;
780     parser->last_char_was_space = TRUE;
781 
782     ret = parser_parse_line (parser, error);
783 
784     parser->buffer--;
785     return ret;
786 }
787 
788 static gboolean
parse_n(YelpManParser * parser,GError ** error)789 parse_n (YelpManParser *parser, GError **error)
790 {
791     xmlNodePtr node;
792 
793     /* When we're in the header, the parse_n is responsible for
794      * switching to body text. (See the body of parse_text() for more
795      * of an explanation).
796      */
797     if (parser->state == START) {
798         /* Oh no! We've not got a proper title yet! Ho hum, let's
799            stick whatever's going into a 'title title' and have a null
800            section. Sob.
801         */
802         register_title (parser,
803                         parser->accumulator->str,
804                         "unknown section");
805         g_string_truncate (parser->accumulator, 0);
806         parser->state = BODY;
807         return TRUE;
808     }
809 
810     if (parser->state == HAVE_TITLE) {
811         /* What we've got so far is the manual's collection, followed
812            by the title again. So we want to get rid of the latter if
813            possible...
814         */
815         right_truncate_common (parser->accumulator->str,
816                                parser->title_str);
817         unicode_strstrip (parser->accumulator->str);
818 
819         xmlNewTextChild (parser->header,
820                          NULL, BAD_CAST "collection",
821                          BAD_CAST parser->accumulator->str);
822         g_string_truncate (parser->accumulator, 0);
823         parser->state = BODY;
824         parser->section_state = SECTION_BODY;
825         return TRUE;
826     }
827 
828     /* parser->state == BODY */
829     if (parser->section_state == SECTION_TITLE) {
830 
831         g_strchomp (parser->accumulator->str);
832         xmlNewTextChild (parser->section_node, NULL,
833                          BAD_CAST "title",
834                          BAD_CAST parser->accumulator->str);
835         g_string_truncate (parser->accumulator, 0);
836 
837         parser->section_state = SECTION_BODY;
838     }
839     else if (parser->sheet_node != NULL) {
840         /*
841           In the body of a section, when we get to a newline we should
842           have an accumulator with text in it and a non-null sheet
843           (hopefully!).
844 
845           We know the current font, so add a span for that font
846           containing the relevant text. Then add a <br/> tag.
847         */
848         finish_span (parser);
849         node = xmlNewNode (NULL, BAD_CAST "br");
850         xmlAddChild (parser->sheet_node, node);
851     }
852 
853     parser->newline = TRUE;
854     parser->last_char_was_space = FALSE;
855 
856     return TRUE;
857 }
858 
859 static void
finish_span(YelpManParser * parser)860 finish_span (YelpManParser *parser)
861 {
862     xmlNodePtr node;
863 
864     if (parser->accumulator->str[0] != '\0') {
865         node = xmlNewTextChild (parser->sheet_node, NULL,
866                                 BAD_CAST "span",
867                                 BAD_CAST parser->accumulator->str);
868         xmlNewProp (node, BAD_CAST "class",
869                     BAD_CAST get_font (parser));
870         g_string_truncate (parser->accumulator, 0);
871     }
872 }
873 
874 static guint
dx_to_em_count(YelpManParser * parser,guint dx)875 dx_to_em_count (YelpManParser *parser, guint dx)
876 {
877     return (int)(dx / ((float)parser->char_width));
878 }
879 
880 static gboolean
parse_N(YelpManParser * parser,GError ** error)881 parse_N (YelpManParser *parser, GError **error)
882 {
883     gint n;
884     gchar tmp[2];
885 
886     if (SSCANF ("N%i", 1, &n)) {
887         RAISE_PARSE_ERROR ("Strange format for N line: %s");
888     }
889     if (n > 127) {
890         RAISE_PARSE_ERROR ("N line has non-7-bit character: %s");
891     }
892     if (n < -200) {
893         RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s");
894     }
895 
896     if (n < 0) {
897         append_nbsps (parser, -n);
898         parser->N_count += -n;
899         return TRUE;
900     }
901 
902     parser->N_count++;
903 
904     tmp[0] = (gchar)n;
905     tmp[1] = '\0';
906 
907     return cheeky_call_parse_line (parser, error, 'N', tmp);
908 }
909 
910 static void
append_nbsps(YelpManParser * parser,guint k)911 append_nbsps (YelpManParser *parser, guint k)
912 {
913     for (; k > 0; k--) {
914         /* 0xc2 0xa0 is nonbreaking space in utf8 */
915         g_string_append_c (parser->accumulator, 0xc2);
916         g_string_append_c (parser->accumulator, 0xa0);
917     }
918 }
919 
920 static gboolean
parse_C(YelpManParser * parser,GError ** error)921 parse_C (YelpManParser *parser, GError **error)
922 {
923     gchar name[17];
924     gunichar code = 0;
925     guint k;
926     gint len;
927 
928     if (SSCANF ("C%16s", 1, name)) {
929         RAISE_PARSE_ERROR ("Can't understand special character: %s");
930     }
931 
932     for (k=0; char_translations[k].from; k++) {
933         if (g_str_equal (char_translations[k].from, name)) {
934             code = char_translations[k].to;
935             break;
936         }
937     }
938     if (sscanf (name, "u%x", &k) == 1) {
939         code = k;
940     }
941 
942     if (!code) {
943         g_warning ("Couldn't parse troff special character: '%s'",
944                    name);
945         code = 65533; /* Unicode replacement character */
946     }
947 
948     /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */
949     len = g_unichar_to_utf8 (code, name);
950     name[len] = '\0';
951 
952     parser->N_count++;
953 
954     return cheeky_call_parse_line (parser, error, 'C', name);
955 }
956 
957 static void
deal_with_newlines(YelpManParser * parser)958 deal_with_newlines (YelpManParser *parser)
959 {
960     /*
961       If newline is true, this is the first word on a line.
962 
963       In which case, we check to see whether hpos agrees with the
964       current sheet's indent. If so (or if there isn't a sheet yet!),
965       we just add to the accumulator. If not, start a new sheet with
966       the correct indent.
967 
968       If we aren't the first word on the line, just add to the
969       accumulator.
970     */
971     gchar tmp[64];
972     guint jump_lines;
973     gboolean made_sheet = FALSE, dont_jump = FALSE;
974 
975     /* This only happens at the start of a section, where there's
976        already a gap
977     */
978     if (!parser->sheet_node) {
979         dont_jump = TRUE;
980     }
981 
982     if ((!parser->sheet_node) ||
983         (parser->newline && (parser->hpos != parser->sheet_indent))) {
984         new_sheet (parser);
985         made_sheet = TRUE;
986     }
987 
988     if (parser->newline) {
989         if ((parser->last_vertical_jump > 0) && (!dont_jump)) {
990             jump_lines =
991                 parser->last_vertical_jump/parser->char_height;
992         } else {
993             jump_lines = 1;
994         }
995 
996         if (jump_lines > 1) {
997             if (!made_sheet) new_sheet (parser);
998             made_sheet = TRUE;
999         }
1000 
1001         snprintf (tmp, 64, "%u", dx_to_em_count (parser, parser->hpos));
1002         xmlNewProp (parser->sheet_node,
1003                     BAD_CAST "indent", BAD_CAST tmp);
1004 
1005         if (made_sheet) {
1006             snprintf (tmp, 64, "%u", jump_lines-1);
1007             xmlNewProp (parser->sheet_node,
1008                         BAD_CAST "jump", BAD_CAST tmp);
1009         }
1010     }
1011 
1012     parser->newline = FALSE;
1013     parser->last_vertical_jump = 0;
1014 }
1015 
1016 static gboolean
parse_p(YelpManParser * parser,GError ** error)1017 parse_p (YelpManParser *parser, GError **error)
1018 {
1019     parser->vpos = 0;
1020     parser->hpos = 0;
1021     return TRUE;
1022 }
1023 
1024 static void
new_sheet(YelpManParser * parser)1025 new_sheet (YelpManParser *parser)
1026 {
1027     /* We don't need to worry about finishing the current sheet,
1028        since the accumulator etc. get cleared on newlines and we
1029        know we're at the start of a line.
1030     */
1031     parser->sheet_node =
1032         xmlAddChild (parser->section_node,
1033                      xmlNewNode (NULL, BAD_CAST "sheet"));
1034     parser->sheet_indent = parser->hpos;
1035 }
1036 
1037 static void
register_title(YelpManParser * parser,const gchar * name,const gchar * section)1038 register_title (YelpManParser *parser,
1039                 const gchar* name, const gchar* section)
1040 {
1041     xmlNewTextChild (parser->header,
1042                      NULL, BAD_CAST "title", BAD_CAST name);
1043     xmlNewTextChild (parser->header,
1044                      NULL, BAD_CAST "section", BAD_CAST section);
1045 }
1046 
/* Chop off the end of dst the trailing bytes it has in common with the
 * end of src.
 *
 * dst is modified in place (bytes are overwritten with NUL from the
 * right). At most min(strlen(dst), strlen(src)) - 1 bytes are
 * compared, so dst is never emptied and the first byte of the shorter
 * string never takes part in the comparison.
 *
 * If either string is NULL or empty there is nothing in common and dst
 * is left untouched. (Without this check the length arithmetic would
 * wrap around and the pointers would step before the buffers.)
 *
 * The comparison is byte-wise, not per UTF-8 character.
 */
static void
right_truncate_common (gchar *dst, const gchar *src)
{
    gsize len_src, len_dst, remaining;
    gchar *d;
    const gchar *s;

    if (dst == NULL || src == NULL) return;

    len_src = strlen (src);
    len_dst = strlen (dst);
    if (len_src == 0 || len_dst == 0) return;

    remaining = ((len_src < len_dst) ? len_src : len_dst) - 1;

    d = dst + len_dst - 1;
    s = src + len_src - 1;

    while (remaining > 0 && *d == *s) {
        *d = '\0';

        remaining--;
        d--;
        s--;
    }
}
1067 
1068 static gboolean
cheeky_call_parse_line(YelpManParser * parser,GError ** error,gchar first_char,const gchar * text)1069 cheeky_call_parse_line (YelpManParser *parser, GError **error,
1070                         gchar first_char, const gchar* text)
1071 {
1072     /* Do a cunning trick. There's all sorts of code that parse_text
1073      * does, which we don't want to duplicate in parse_N and
1074      * parse_C. So feed a buffer back to parse_text. Tada! Start it
1075      * with "C" or "N" rather than "t" so clever stuff in parse_text
1076      * can tell the difference.
1077      */
1078     gchar *tmp;
1079     gboolean ret;
1080     guint len = strlen (text);
1081 
1082     tmp = parser->buffer;
1083     parser->buffer = g_new (gchar, 2 + len);
1084     parser->buffer[0] = first_char;
1085     strncpy (parser->buffer + 1, text, len + 1);
1086 
1087     ret = parse_text (parser, error);
1088 
1089     g_free (parser->buffer);
1090     parser->buffer = tmp;
1091 
1092     return ret;
1093 }
1094 
1095 static void
cleanup_parsed_page(YelpManParser * parser)1096 cleanup_parsed_page (YelpManParser *parser)
1097 {
1098     /* First job: the last line usually has the version, date and
1099      * title (again!). The code above misunderstands and parses this
1100      * as a section, so we need to "undo" this and stick the data in
1101      * the header where it belongs.
1102      *
1103      * parser->section_node should still point to it. We assume this
1104      * has happened if it has exactly one child element (the <title>
1105      * tag)
1106      */
1107     gchar *lastline;
1108     GRegex *regex;
1109     gchar regex_string [1024];
1110 
1111     if (xmlChildElementCount (parser->section_node) == 1) {
1112         lastline = (gchar *)xmlNodeGetContent (parser->section_node);
1113 
1114         /* If parse_last_line works, it sets the data from it in the
1115            <header> tag, so delete the final section. */
1116         if (parse_last_line (parser, lastline)) {
1117             xmlUnlinkNode (parser->section_node);
1118             xmlFreeNode (parser->section_node);
1119         }
1120         else {
1121             /* Oh dear. This would be unexpected and doesn't seem to
1122                happen with man on my system. But we probably shouldn't
1123                ditch the info, so let's leave the <section> tag and
1124                print a warning message to the console.
1125             */
1126             g_warning ("Unexpected final line in man document (%s)\n",
1127                        lastline);
1128         }
1129 
1130         xmlFree (lastline);
1131     }
1132 
1133     /* Next job: Go through and stick the links in. Text that looks
1134      * like man(1) should be converted to a link to man:man(1) and
1135      * urls should also be linkified.
1136      *
1137      * Unfortunately, it's not entirely clear what constitutes a valid
1138      * section. All sections must be alphanumeric and the logic we use
1139      * to avoid extra hits (eg "one or more widget(s)") is that either
1140      * the section must start with a digit or (if the current section
1141      * doesn't) must start with the same letter as the current
1142      * section.
1143      */
1144     snprintf (regex_string, 1024,
1145               "([a-zA-Z0-9\\-_.:]+)\\(((%c|[0-9])[a-zA-Z0-9]*)\\)",
1146               parser->section ? parser->section[0] : '0');
1147     regex = g_regex_new (regex_string, 0, 0, NULL);
1148     g_return_if_fail (regex);
1149     fixup_links (parser, regex, man_link_inserter);
1150     g_regex_unref (regex);
1151 
1152     /* Now for http:// links.
1153      */
1154     regex = g_regex_new ("https?:\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+"
1155                          "([\\w\\-\\.,@?^=%&:/~\\+#]*"
1156                          "[\\w\\-\\@?^=%&/~\\+#])?",
1157                          0, 0, NULL);
1158     g_return_if_fail (regex);
1159     fixup_links (parser, regex, http_link_inserter);
1160     g_regex_unref (regex);
1161 }
1162 
1163 static gchar *
skip_whitespace(gchar * text)1164 skip_whitespace (gchar *text)
1165 {
1166     while (g_unichar_isspace (g_utf8_get_char (text))) {
1167         text = g_utf8_next_char (text);
1168     }
1169     return text;
1170 }
1171 
1172 static gchar *
last_non_whitespace(gchar * text)1173 last_non_whitespace (gchar *text)
1174 {
1175     gchar *end = text + strlen(text);
1176     gchar *prev;
1177 
1178     prev = g_utf8_find_prev_char (text, end);
1179     if (!prev) {
1180         /* The string must have been zero-length. */
1181         return NULL;
1182     }
1183 
1184     while (g_unichar_isspace (g_utf8_get_char (prev))) {
1185         end = prev;
1186         prev = g_utf8_find_prev_char (text, prev);
1187         if (!prev) return NULL;
1188     }
1189     return end;
1190 }
1191 
1192 static gchar *
find_contiguous_whitespace(gchar * text,guint ws_len)1193 find_contiguous_whitespace (gchar *text, guint ws_len)
1194 {
1195     guint counter = 0;
1196     gchar *ws_start = NULL;
1197     while (*text) {
1198         if (g_unichar_isspace (g_utf8_get_char (text))) {
1199             if (!counter) ws_start = text;
1200             counter++;
1201         }
1202         else counter = 0;
1203 
1204         if (counter == ws_len) return ws_start;
1205 
1206         text = g_utf8_next_char (text);
1207     }
1208     return NULL;
1209 }
1210 
1211 static gboolean
parse_last_line(YelpManParser * parser,gchar * line)1212 parse_last_line (YelpManParser *parser, gchar* line)
1213 {
1214     /* We expect a line of the form
1215            '1.2.3      blah 2009       libfoo(1)'
1216        where the spaces are all nbsp's.
1217 
1218        Look for a gap of at least 3 in a row. If we find that, expand
1219        either side and declare the stuff before to be the version
1220        number and then the stuff afterwards to be the start of the
1221        date. Then do the same thing on the next gap, if there is one.
1222     */
1223     gchar *gap, *date_start;
1224 
1225     gchar *version;
1226     gchar *date;
1227 
1228     gap = find_contiguous_whitespace (line, 3);
1229     if (!gap) return FALSE;
1230 
1231     version = g_strndup (line, gap - line);
1232 
1233     date_start = skip_whitespace (gap);
1234 
1235     gap = find_contiguous_whitespace (date_start, 3);
1236     if (!gap) return FALSE;
1237 
1238     date = g_strndup (date_start, gap - date_start);
1239 
1240     xmlNewProp (parser->header, BAD_CAST "version", BAD_CAST version);
1241     xmlNewProp (parser->header, BAD_CAST "date", BAD_CAST date);
1242 
1243     g_free (version);
1244     g_free (date);
1245 
1246     return TRUE;
1247 }
1248 
1249 /* This should work like g_strstrip, but that's an ASCII-only version
1250  * and I want to strip the nbsp's that I so thoughtfully plaster
1251  * stuff with...
1252  */
1253 static void
unicode_strstrip(gchar * str)1254 unicode_strstrip (gchar *str)
1255 {
1256     gchar *start, *end;
1257 
1258     if (str == NULL) return;
1259 
1260     end = last_non_whitespace (str);
1261 
1262     if (!end) {
1263         /* String is zero-length or entirely whitespace */
1264         *str = '\0';
1265         return;
1266     }
1267     start = skip_whitespace (str);
1268 
1269     memmove (str, start, end - start);
1270     *(str + (end - start)) = '\0';
1271 }
1272 
1273 static void
sheet_fixup_links(xmlNodePtr sheet,const GRegex * regex,link_inserter inserter)1274 sheet_fixup_links (xmlNodePtr sheet,
1275                    const GRegex *regex, link_inserter inserter)
1276 {
1277     /*
1278       This works as follows: grab (<span>) nodes from a sheet in
1279       order and stick their contents into a string. Since a sheet
1280       won't be ludicrously long, we can just grab everything and then
1281       work over it, but we need to keep track of which node points at
1282       which bit of the string so we can call inserter helpfully. To do
1283       so, use byte offsets, since that seems less likely to go
1284       horribly wrong!
1285     */
1286     GString *accumulator = g_string_new ("");
1287     xmlNodePtr span;
1288     xmlChar *tmp;
1289     gsize offset = 0;
1290     gsize len;
1291     offset_elt_pair pair;
1292     GMatchInfo *match_info;
1293 
1294     /* Make pairs zero-terminated so that code can iterate through it
1295      * looking for something with elt = NULL. */
1296     GArray *pairs = g_array_new (TRUE, FALSE,
1297                                  sizeof (offset_elt_pair));
1298 
1299     g_return_if_fail (regex);
1300     g_return_if_fail (inserter);
1301     g_return_if_fail (sheet);
1302 
1303     for (span = sheet->children; span != NULL; span = span->next) {
1304         if (span->type != XML_ELEMENT_NODE) continue;
1305 
1306         if (strcmp ((const char*) span->name, "span") != 0) {
1307 
1308             if (strcmp ((const char*) span->name, "a") == 0)
1309                 continue;
1310 
1311             if (strcmp ((const char*) span->name, "br") == 0) {
1312                 /* If the last character in the accumulator is a
1313                  * hyphen, we don't want to include that in the link
1314                  * we make. If not, append a newline to the
1315                  * accumulator (so we don't mistakenly make links from
1316                  * "see\nthis(2)" to seethis(2).
1317                  *
1318                  * Either way, we add the <br> to the list of pairs
1319                  * since we might need to do stuff with it if it's in
1320                  * the middle of a link.
1321                  */
1322                 len = strlen (accumulator->str);
1323                 if (len > 0 && accumulator->str [len-1] == '-') {
1324                     g_string_truncate (accumulator, len - 1);
1325                     offset--;
1326                 }
1327                 else {
1328                     g_string_append_c (accumulator, '\n');
1329                     offset++;
1330                 }
1331                 pair.start = offset;
1332                 pair.end = offset;
1333                 pair.elt = span; /* Er, br in fact. */
1334                 g_array_append_val (pairs, pair);
1335 
1336                 continue;
1337             }
1338 
1339             g_warning ("Expected all child elements to be "
1340                        "<span>, <br> or <a>, but "
1341                        "have found a <%s>.",
1342                        (gchar *) span->name);
1343             continue;
1344         }
1345 
1346         tmp = xmlNodeGetContent (span);
1347         g_string_append (accumulator, (gchar *) tmp);
1348         len = strlen ((const char*) tmp);
1349 
1350         pair.start = offset;
1351         pair.end = offset + len;
1352         pair.elt = span;
1353 
1354         g_array_append_val (pairs, pair);
1355 
1356         offset += len;
1357         xmlFree (tmp);
1358     }
1359 
1360     /* We've got the data. Now try to match the regex against it as
1361      * many times as possible
1362      */
1363     offset = 0;
1364     g_regex_match_full (regex, accumulator->str,
1365                         -1, offset, 0, &match_info, NULL);
1366     while (g_match_info_matches (match_info)) {
1367         offset = inserter ((offset_elt_pair *)pairs->data,
1368                            match_info);
1369 
1370         g_match_info_free (match_info);
1371 
1372         g_regex_match_full (regex, accumulator->str,
1373                             -1, offset, 0, &match_info, NULL);
1374     }
1375 
1376     g_string_free (accumulator, TRUE);
1377     g_array_unref (pairs);
1378 }
1379 
1380 static void
fixup_links(YelpManParser * parser,const GRegex * regex,link_inserter inserter)1381 fixup_links (YelpManParser *parser,
1382              const GRegex *regex, link_inserter inserter)
1383 {
1384     /* Iterate over all the <sheet>'s in the xml document */
1385     xmlXPathContextPtr context;
1386     xmlXPathObjectPtr path_obj;
1387     xmlNodeSetPtr nodeset;
1388     gint i;
1389 
1390     context = xmlXPathNewContext (parser->doc);
1391     g_return_if_fail (context);
1392 
1393     path_obj = xmlXPathEvalExpression (BAD_CAST "//sheet", context);
1394     g_return_if_fail (path_obj);
1395 
1396     nodeset = path_obj->nodesetval;
1397     g_return_if_fail (nodeset);
1398 
1399     for (i = 0; i < nodeset->nodeNr; ++i) {
1400         sheet_fixup_links (nodeset->nodeTab[i], regex, inserter);
1401     }
1402 
1403     xmlXPathFreeObject (path_obj);
1404     xmlXPathFreeContext (context);
1405 }
1406 
1407 /*
1408   This inserts new_child under parent. If older_sibling is non-NULL,
1409   we stick it immediately after it. Otherwise, insert as the first
1410   child of the parent.
1411 
1412   Returns the inserted child.
1413  */
1414 static xmlNodePtr
insert_child_after(xmlNodePtr parent,xmlNodePtr older_sibling,xmlNodePtr new_child)1415 insert_child_after (xmlNodePtr parent, xmlNodePtr older_sibling,
1416                     xmlNodePtr new_child)
1417 {
1418     g_return_val_if_fail (parent && new_child, new_child);
1419 
1420     if (older_sibling) {
1421         xmlAddNextSibling (older_sibling, new_child);
1422     }
1423     else if (parent->children == NULL) {
1424         xmlAddChild (parent, new_child);
1425     }
1426     else {
1427         xmlAddPrevSibling (parent->children, new_child);
1428     }
1429 
1430     return new_child;
1431 }
1432 
1433 static void
copy_prop(xmlNodePtr to,xmlNodePtr from,const xmlChar * name)1434 copy_prop (xmlNodePtr to, xmlNodePtr from, const xmlChar *name)
1435 {
1436     xmlChar *prop = xmlGetProp (from, name);
1437     g_return_if_fail (prop);
1438     xmlSetProp (to, name, prop);
1439     xmlFree (prop);
1440 }
1441 
/* Move the part of a sheet that covers the byte range
 * [startpos, endpos) underneath anchor_node.
 *
 * anchor_node: the node to insert; it is placed in the parent of the
 *              first overlapping element and receives new <span>/<br>
 *              children for the covered text.
 * offsets:     the (start, end, elt) pairs for the sheet, terminated
 *              by an entry with elt == NULL. The entry of a span that
 *              is split at endpos is updated in place to describe the
 *              remaining tail.
 * startpos, endpos: byte offsets into the accumulated sheet text.
 *
 * Text of the first span that lies before startpos is kept in a new
 * span in front of the anchor; text of the last span that lies at or
 * after endpos is kept in a new span behind it. The original elements
 * that overlap the range are unlinked and freed.
 *
 * Returns the start offset of the pair the walk stopped at, or endpos
 * if the pair list was exhausted (or no overlapping element was
 * found).
 *
 * NOTE(review): entries for elements freed here keep their old elt
 * pointer; it looks like later calls only skip over them by offset,
 * but confirm before dereferencing offsets[i].elt elsewhere.
 */
static gsize
do_node_replacement (xmlNodePtr anchor_node,
                     offset_elt_pair *offsets,
                     gsize startpos, gsize endpos)
{
    xmlNodePtr node, sibling_before;
    gchar *gtmp;
    xmlChar *xtmp, *xshort;
    gsize look_from;

    /* Find the first element by searching through offsets. I suppose
     * a binary search would be cleverer, but I doubt that this will
     * take significant amounts of time.
     *
     * We should never fall off the end, but (just in case) the GArray
     * that holds the offsets is zero-terminated and elt should never
     * be NULL so we can stop if necessary
     */
    while ((offsets->end <= startpos) && offsets->elt) {
        offsets++;
    }
    g_return_val_if_fail (offsets->elt, endpos);

    /* xtmp is NULL by default, but we do this here so that if we read
     * the node in the if block below, we don't have to do it a second
     * time.
     */
    xtmp = NULL;
    /* Remember what precedes the first overlapping element, so the new
     * nodes can be inserted at its position. */
    sibling_before = offsets->elt->prev;
    look_from = startpos;

    /* Maybe there's text in the relevant span before the start of
     * the stuff we want to replace with a link.
     */
    if (startpos > offsets->start) {
        node = xmlNewNode (NULL, BAD_CAST "span");
        copy_prop (node, offsets->elt, BAD_CAST "class");

        xtmp = xmlNodeGetContent (offsets->elt);
        /* Only the bytes in front of startpos go into the new span. */
        gtmp = g_strndup ((const gchar*)xtmp, startpos - offsets->start);
        xmlNodeAddContent (node, BAD_CAST gtmp);
        g_free (gtmp);

        sibling_before = insert_child_after (offsets->elt->parent,
                                             sibling_before, node);
    }

    insert_child_after (offsets->elt->parent,
                        sibling_before, anchor_node);

    /* The main loop. Here we work over each span that overlaps with
     * the link we're adding. We add a similar span as a child of the
     * anchor node and then delete the existing one.  */
    while (look_from < endpos) {
        if (!xtmp) xtmp = xmlNodeGetContent (offsets->elt);

        if (strcmp ((const gchar*)offsets->elt->name, "br") == 0) {
            /* A line break inside the range: recreate it inside the
             * anchor and drop the original. */
            node = xmlNewChild (anchor_node,
                                NULL, BAD_CAST "br", NULL);
            xmlUnlinkNode (offsets->elt);
            xmlFreeNode (offsets->elt);
            xmlFree (xtmp);
            xtmp = NULL;
            offsets++;
        }
        else if (endpos < offsets->end) {
            /* The range ends inside this span: split it. xshort is
             * the span's text cut off at endpos. */
            xshort = BAD_CAST g_strndup ((const gchar*)xtmp,
                                         endpos - offsets->start);

            node = xmlNewChild (anchor_node, NULL, BAD_CAST "span",
                                xshort + (look_from-offsets->start));
            copy_prop (node, offsets->elt, BAD_CAST "class");

            /* The rest of the text goes into a new span right after
             * the anchor. */
            node = xmlNewNode (NULL, BAD_CAST "span");
            xmlNodeAddContent (node,
                               xtmp + (endpos - offsets->start));
            copy_prop (node, offsets->elt, BAD_CAST "class");
            xmlAddNextSibling (anchor_node, node);

            xmlFree (xshort);

            xmlUnlinkNode (offsets->elt);
            xmlFreeNode (offsets->elt);
            xmlFree (xtmp);
            xtmp = NULL;

            /* Point this entry at the tail span, so that it describes
             * what is left of the original one. */
            offsets->start = endpos;
            offsets->elt = node;
        }
        else {
            /* The span lies within the range from look_from to its
             * end: move that text under the anchor. */
            node = xmlNewChild (anchor_node, NULL, BAD_CAST "span",
                                xtmp + (look_from - offsets->start));
            copy_prop (node, offsets->elt, BAD_CAST "class");

            xmlUnlinkNode (offsets->elt);
            xmlFreeNode (offsets->elt);
            xmlFree (xtmp);
            xtmp = NULL;
            offsets++;
        }

        if (!offsets->elt) {
            /* We got to the end of a sheet and of the stuff we're
             * doing at the same time
             */
            return endpos;
        }

        look_from = offsets->start;
    }

    return offsets->start;
}
1555 
1556 static gsize
do_link_insertion(const gchar * url,offset_elt_pair * offsets,gsize startpos,gsize endpos)1557 do_link_insertion (const gchar *url,
1558                    offset_elt_pair *offsets,
1559                    gsize startpos, gsize endpos)
1560 {
1561     xmlNodePtr anchor_node = xmlNewNode (NULL, BAD_CAST "a");
1562 
1563     xmlNewProp (anchor_node, BAD_CAST "href", BAD_CAST url);
1564 
1565     return do_node_replacement (anchor_node, offsets,
1566                                 startpos, endpos);
1567 }
1568 
1569 static gsize
man_link_inserter(offset_elt_pair * offsets,const GMatchInfo * match_info)1570 man_link_inserter (offset_elt_pair *offsets,
1571                    const GMatchInfo *match_info)
1572 {
1573     gchar *name, *section;
1574     gchar url[1024];
1575 
1576     gint startpos, endpos;
1577 
1578     g_match_info_fetch_pos (match_info, 0, &startpos, &endpos);
1579 
1580     name = g_match_info_fetch (match_info, 1);
1581     section = g_match_info_fetch (match_info, 2);
1582 
1583     g_return_val_if_fail (name && section, endpos);
1584 
1585     snprintf (url, 1024, "man:%s(%s)", name, section);
1586 
1587     g_free (name);
1588     g_free (section);
1589 
1590     return do_link_insertion (url, offsets, startpos, endpos);
1591 }
1592 
1593 static gsize
http_link_inserter(offset_elt_pair * offsets,const GMatchInfo * match_info)1594 http_link_inserter (offset_elt_pair *offsets,
1595                     const GMatchInfo *match_info)
1596 {
1597     gchar *url;
1598     gint startpos, endpos;
1599     gsize ret;
1600 
1601     url = g_match_info_fetch (match_info, 0);
1602     g_match_info_fetch_pos (match_info, 0, &startpos, &endpos);
1603 
1604     ret = do_link_insertion (url, offsets, startpos, endpos);
1605 
1606     g_free (url);
1607 
1608     return ret;
1609 }
1610