/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
 *
 * Copyright (C) 2014-2016 Richard Hughes <richard@hughsie.com>
 *
 * SPDX-License-Identifier: LGPL-2.1+
 */
7
/**
 * SECTION:as-markup
 * @short_description: Functions for managing AppStream description markup
 * @include: appstream-glib.h
 * @stability: Stable
 *
 * These functions are used internally by libappstream-glib, and some may be
 * useful to user-applications.
 */
17
18 #include "config.h"
19
20 #include <string.h>
21
22 #include "as-markup.h"
23 #include "as-node.h"
24 #include "as-utils.h"
25
/* Parser state used while importing HTML: the kind of element whose text is
 * currently being collected. */
typedef enum {
	AS_MARKUP_TAG_UNKNOWN,	/* text is discarded while in this state */
	AS_MARKUP_TAG_PARA,	/* buffered text is flushed as <p>...</p> */
	AS_MARKUP_TAG_OL,	/* NOTE(review): not referenced by the code visible in this file */
	AS_MARKUP_TAG_UL,	/* inside a list, but no <li> has been seen yet */
	AS_MARKUP_TAG_LI,	/* buffered text is flushed as <li>...</li> */
	AS_MARKUP_TAG_LAST
} AsMarkupTag;
34
/* State shared between the GMarkup callbacks during an HTML import. */
typedef struct {
	AsMarkupTag	 action;	/* current parser state */
	GString		*output;	/* markup generated so far */
	GString		*temp;		/* text collected since the last flush */
} AsMarkupImportHelper;
40
41 static void
as_markup_import_html_flush(AsMarkupImportHelper * helper)42 as_markup_import_html_flush (AsMarkupImportHelper *helper)
43 {
44 gchar *tmp;
45 guint i;
46 g_auto(GStrv) split = NULL;
47
48 /* trivial case */
49 if (helper->action == AS_MARKUP_TAG_UNKNOWN)
50 return;
51 if (helper->temp->len == 0)
52 return;
53
54 /* split into lines and strip */
55 split = g_strsplit (helper->temp->str, "\n", -1);
56 for (i = 0; split[i] != NULL; i++) {
57 tmp = g_strstrip (split[i]);
58 if (tmp[0] == '\0')
59 continue;
60 switch (helper->action) {
61 case AS_MARKUP_TAG_PARA:
62 g_string_append_printf (helper->output, "<p>%s</p>", tmp);
63 break;
64 case AS_MARKUP_TAG_LI:
65 g_string_append_printf (helper->output, "<li>%s</li>", tmp);
66 break;
67 default:
68 break;
69 }
70 }
71 g_string_truncate (helper->temp, 0);
72 }
73
74 static void
as_markup_import_html_set_tag(AsMarkupImportHelper * helper,AsMarkupTag action_new)75 as_markup_import_html_set_tag (AsMarkupImportHelper *helper, AsMarkupTag action_new)
76 {
77 if (helper->action == AS_MARKUP_TAG_UL &&
78 action_new == AS_MARKUP_TAG_LI) {
79 g_string_append (helper->output, "<ul>");
80 helper->action = action_new;
81 } else if (helper->action == AS_MARKUP_TAG_UL &&
82 action_new == AS_MARKUP_TAG_UNKNOWN) {
83 g_string_append (helper->output, "</ul>");
84 helper->action = action_new;
85 } else {
86 helper->action = action_new;
87 }
88 }
89
90 static void
as_markup_import_html_start_cb(GMarkupParseContext * context,const gchar * element_name,const gchar ** attribute_names,const gchar ** attribute_values,gpointer user_data,GError ** error)91 as_markup_import_html_start_cb (GMarkupParseContext *context,
92 const gchar *element_name,
93 const gchar **attribute_names,
94 const gchar **attribute_values,
95 gpointer user_data,
96 GError **error)
97 {
98 AsMarkupImportHelper *helper = (AsMarkupImportHelper *) user_data;
99
100 /* don't assume the document starts with <p> */
101 if (g_strcmp0 (element_name, "document") == 0 ||
102 g_strcmp0 (element_name, "p") == 0) {
103 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_PARA);
104 return;
105 }
106 if (g_strcmp0 (element_name, "li") == 0) {
107 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_LI);
108 return;
109 }
110 if (g_strcmp0 (element_name, "ul") == 0) {
111 as_markup_import_html_flush (helper);
112 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UL);
113 return;
114 }
115
116 /* do not include the contents of these tags */
117 if (g_strcmp0 (element_name, "h1") == 0 ||
118 g_strcmp0 (element_name, "h2") == 0) {
119 as_markup_import_html_flush (helper);
120 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UNKNOWN);
121 return;
122 }
123 }
124
125 static void
as_markup_import_html_end_cb(GMarkupParseContext * context,const gchar * element_name,gpointer user_data,GError ** error)126 as_markup_import_html_end_cb (GMarkupParseContext *context,
127 const gchar *element_name,
128 gpointer user_data,
129 GError **error)
130 {
131 AsMarkupImportHelper *helper = (AsMarkupImportHelper *) user_data;
132
133 if (g_strcmp0 (element_name, "document") == 0 ||
134 g_strcmp0 (element_name, "p") == 0) {
135 as_markup_import_html_flush (helper);
136 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UNKNOWN);
137 return;
138 }
139 /* don't assume the next section starts with <p> */
140 if (g_strcmp0 (element_name, "h1") == 0 ||
141 g_strcmp0 (element_name, "h2") == 0) {
142 as_markup_import_html_flush (helper);
143 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_PARA);
144 return;
145 }
146 if (g_strcmp0 (element_name, "li") == 0) {
147 as_markup_import_html_flush (helper);
148 /* not UL, else we do a new <ul> on next <li> */
149 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_LI);
150 return;
151 }
152 if (g_strcmp0 (element_name, "ul") == 0 ||
153 g_strcmp0 (element_name, "ol") == 0) {
154 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UL);
155 as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UNKNOWN);
156 return;
157 }
158 }
159
160 static void
as_markup_import_html_text_cb(GMarkupParseContext * context,const gchar * text,gsize text_len,gpointer user_data,GError ** error)161 as_markup_import_html_text_cb (GMarkupParseContext *context,
162 const gchar *text,
163 gsize text_len,
164 gpointer user_data,
165 GError **error)
166 {
167 AsMarkupImportHelper *helper = (AsMarkupImportHelper *) user_data;
168 g_autofree gchar *tmp = NULL;
169
170 if (helper->action == AS_MARKUP_TAG_UNKNOWN)
171 return;
172
173 /* do not assume text is NULL-terminated */
174 tmp = g_strndup (text, text_len);
175 g_string_append (helper->temp, tmp);
176 }
177
178 static void
as_markup_import_html_erase(GString * str,const gchar * start,const gchar * end)179 as_markup_import_html_erase (GString *str, const gchar *start, const gchar *end)
180 {
181 gssize start_len = (gssize) strlen (start);
182 gssize end_len = (gssize) strlen (end);
183 if (start_len + end_len > str->len)
184 return;
185 for (gssize i = 0; i < str->len - start_len; i++) {
186 if (memcmp (&str->str[i], start, (gsize) start_len) != 0)
187 continue;
188 for (gssize j = i; i < (gssize) str->len; j++) {
189 if (memcmp (&str->str[j], end, (gsize) end_len) != 0)
190 continue;
191 /* delete this section and restart the search */
192 g_string_erase (str, i, (j - i) + end_len);
193 i = -1;
194 break;
195 }
196 }
197 }
198
199 static gchar *
as_markup_import_html(const gchar * text,GError ** error)200 as_markup_import_html (const gchar *text, GError **error)
201 {
202 AsMarkupImportHelper helper;
203 GMarkupParser parser = {
204 as_markup_import_html_start_cb,
205 as_markup_import_html_end_cb,
206 as_markup_import_html_text_cb,
207 NULL,
208 NULL };
209 g_autoptr(GMarkupParseContext) ctx = NULL;
210 g_autoptr(GString) str = NULL;
211 g_autoptr(GString) helper_output = NULL;
212 g_autoptr(GString) helper_temp = NULL;
213
214 /* clean these up on failure */
215 helper_output = g_string_new ("");
216 helper_temp = g_string_new ("");
217
218 /* set up XML parser */
219 helper.action = AS_MARKUP_TAG_UNKNOWN;
220 helper.output = helper_output;
221 helper.temp = helper_temp;
222 ctx = g_markup_parse_context_new (&parser,
223 G_MARKUP_TREAT_CDATA_AS_TEXT,
224 &helper,
225 NULL);
226
227 /* ensure this has at least one set of quotes */
228 str = g_string_new ("");
229 g_string_append_printf (str, "<document>%s</document>", text);
230
231 /* convert win32 line endings */
232 g_strdelimit (str->str, "\r", '\n');
233
234 /* treat as paragraph breaks */
235 as_utils_string_replace (str, "<br>", "\n");
236
237 /* tidy up non-compliant HTML5 */
238 as_markup_import_html_erase (str, "<img", ">");
239 as_markup_import_html_erase (str, "<link", ">");
240 as_markup_import_html_erase (str, "<meta", ">");
241
242 /* use UTF-8 */
243 as_utils_string_replace (str, "™", "™");
244 as_utils_string_replace (str, "®", "®");
245 as_utils_string_replace (str, " ", " ");
246
247 /* parse */
248 if (!g_markup_parse_context_parse (ctx, str->str, -1, error))
249 return NULL;
250
251 /* return only valid AppStream markup */
252 return as_markup_convert_full (helper.output->str,
253 AS_MARKUP_CONVERT_FORMAT_APPSTREAM,
254 AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS,
255 error);
256 }
257
258 static gchar *
as_markup_import_simple(const gchar * text,GError ** error)259 as_markup_import_simple (const gchar *text, GError **error)
260 {
261 GString *str;
262 guint i;
263 g_auto(GStrv) lines = NULL;
264
265 /* empty */
266 if (text == NULL || text[0] == '\0')
267 return NULL;
268
269 /* just assume paragraphs */
270 str = g_string_new ("<p>");
271 lines = g_strsplit (text, "\n", -1);
272 for (i = 0; lines[i] != NULL; i++) {
273 g_autofree gchar *markup = NULL;
274 if (lines[i][0] == '\0') {
275 if (g_str_has_suffix (str->str, " "))
276 g_string_truncate (str, str->len - 1);
277 g_string_append (str, "</p><p>");
278 continue;
279 }
280 markup = g_markup_escape_text (lines[i], -1);
281 g_string_append (str, markup);
282 g_string_append (str, " ");
283 }
284 if (g_str_has_suffix (str->str, " "))
285 g_string_truncate (str, str->len - 1);
286 g_string_append (str, "</p>");
287 return g_string_free (str, FALSE);
288 }
289
290 /**
291 * as_markup_import:
292 * @text: the text to import.
293 * @format: the #AsMarkupConvertFormat, e.g. %AS_MARKUP_CONVERT_FORMAT_SIMPLE
294 * @error: A #GError or %NULL
295 *
296 * Imports text and converts to AppStream markup.
297 *
298 * Returns: (transfer full): appstream markup, or %NULL in event of an error
299 *
300 * Since: 0.5.11
301 */
302 gchar *
as_markup_import(const gchar * text,AsMarkupConvertFormat format,GError ** error)303 as_markup_import (const gchar *text, AsMarkupConvertFormat format, GError **error)
304 {
305 if (format == AS_MARKUP_CONVERT_FORMAT_SIMPLE)
306 return as_markup_import_simple (text, error);
307 if (format == AS_MARKUP_CONVERT_FORMAT_HTML)
308 return as_markup_import_html (text, error);
309 g_set_error_literal (error,
310 AS_UTILS_ERROR,
311 AS_UTILS_ERROR_INVALID_TYPE,
312 "unknown comnversion kind");
313 return NULL;
314 }
315
316 /**
317 * as_markup_strsplit_words:
318 * @text: the text to split.
319 * @line_len: the maximum length of the output line
320 *
321 * Splits up a long line into an array of smaller strings, each being no longer
322 * than @line_len. Words are not split.
323 *
324 * Returns: (transfer full): lines, or %NULL in event of an error
325 *
326 * Since: 0.3.5
327 **/
328 gchar **
as_markup_strsplit_words(const gchar * text,guint line_len)329 as_markup_strsplit_words (const gchar *text, guint line_len)
330 {
331 GPtrArray *lines;
332 guint i;
333 g_autoptr(GString) curline = NULL;
334 g_auto(GStrv) tokens = NULL;
335
336 /* sanity check */
337 if (text == NULL || text[0] == '\0')
338 return NULL;
339 if (line_len == 0)
340 return NULL;
341
342 lines = g_ptr_array_new ();
343 curline = g_string_new ("");
344
345 /* tokenize the string */
346 tokens = g_strsplit (text, " ", -1);
347 for (i = 0; tokens[i] != NULL; i++) {
348
349 /* current line plus new token is okay */
350 if (curline->len + strlen (tokens[i]) < line_len) {
351 g_string_append_printf (curline, "%s ", tokens[i]);
352 continue;
353 }
354
355 /* too long, so remove space, add newline and dump */
356 if (curline->len > 0)
357 g_string_truncate (curline, curline->len - 1);
358 g_string_append (curline, "\n");
359 g_ptr_array_add (lines, g_strdup (curline->str));
360 g_string_truncate (curline, 0);
361 g_string_append_printf (curline, "%s ", tokens[i]);
362
363 }
364
365 /* any incomplete line? */
366 if (curline->len > 0) {
367 g_string_truncate (curline, curline->len - 1);
368 g_string_append (curline, "\n");
369 g_ptr_array_add (lines, g_strdup (curline->str));
370 }
371
372 g_ptr_array_add (lines, NULL);
373 return (gchar **) g_ptr_array_free (lines, FALSE);
374 }
375
376 static void
as_markup_render_para(GString * str,AsMarkupConvertFormat format,const gchar * data)377 as_markup_render_para (GString *str, AsMarkupConvertFormat format, const gchar *data)
378 {
379 guint i;
380 g_autofree gchar *tmp = NULL;
381 g_auto(GStrv) spl = NULL;
382
383 /* ignore <p></p> */
384 if (data == NULL)
385 return;
386
387 if (str->len > 0)
388 g_string_append (str, "\n");
389 switch (format) {
390 case AS_MARKUP_CONVERT_FORMAT_SIMPLE:
391 g_string_append_printf (str, "%s\n", data);
392 break;
393 case AS_MARKUP_CONVERT_FORMAT_APPSTREAM:
394 tmp = g_markup_escape_text (data, -1);
395 g_string_append_printf (str, "<p>%s</p>", tmp);
396 break;
397 case AS_MARKUP_CONVERT_FORMAT_MARKDOWN:
398 /* break to 80 chars */
399 spl = as_markup_strsplit_words (data, 80);
400 for (i = 0; spl[i] != NULL; i++)
401 g_string_append (str, spl[i]);
402 break;
403 default:
404 break;
405 }
406 }
407
408 static void
as_markup_render_li(GString * str,AsMarkupConvertFormat format,const gchar * data)409 as_markup_render_li (GString *str, AsMarkupConvertFormat format, const gchar *data)
410 {
411 guint i;
412 g_autofree gchar *tmp = NULL;
413 g_auto(GStrv) spl = NULL;
414
415 switch (format) {
416 case AS_MARKUP_CONVERT_FORMAT_SIMPLE:
417 g_string_append_printf (str, " • %s\n", data);
418 break;
419 case AS_MARKUP_CONVERT_FORMAT_APPSTREAM:
420 tmp = g_markup_escape_text (data, -1);
421 g_string_append_printf (str, "<li>%s</li>", tmp);
422 break;
423 case AS_MARKUP_CONVERT_FORMAT_MARKDOWN:
424 /* break to 80 chars, leaving room for the dot/indent */
425 spl = as_markup_strsplit_words (data, 80 - 3);
426 g_string_append_printf (str, " * %s", spl[0]);
427 for (i = 1; spl[i] != NULL; i++)
428 g_string_append_printf (str, " %s", spl[i]);
429 break;
430 default:
431 break;
432 }
433 }
434
435 static void
as_markup_render_ul_start(GString * str,AsMarkupConvertFormat format)436 as_markup_render_ul_start (GString *str, AsMarkupConvertFormat format)
437 {
438 switch (format) {
439 case AS_MARKUP_CONVERT_FORMAT_APPSTREAM:
440 g_string_append (str, "<ul>");
441 break;
442 default:
443 break;
444 }
445 }
446
447 static void
as_markup_render_ul_end(GString * str,AsMarkupConvertFormat format)448 as_markup_render_ul_end (GString *str, AsMarkupConvertFormat format)
449 {
450 switch (format) {
451 case AS_MARKUP_CONVERT_FORMAT_APPSTREAM:
452 g_string_append (str, "</ul>");
453 break;
454 default:
455 break;
456 }
457 }
458
459 /**
460 * as_markup_validate:
461 * @markup: the text to validate
462 * @error: A #GError or %NULL
463 *
464 * Validates some markup.
465 *
466 * Returns: %TRUE if the appstream description was valid
467 *
468 * Since: 0.5.1
469 **/
470 gboolean
as_markup_validate(const gchar * markup,GError ** error)471 as_markup_validate (const gchar *markup, GError **error)
472 {
473 g_autofree gchar *tmp = NULL;
474 tmp = as_markup_convert (markup, AS_MARKUP_CONVERT_FORMAT_NULL, error);
475 return tmp != NULL;
476 }
477
478 /**
479 * as_markup_convert_full:
480 * @markup: the text to copy.
481 * @format: the #AsMarkupConvertFormat, e.g. %AS_MARKUP_CONVERT_FORMAT_MARKDOWN
482 * @flags: the #AsMarkupConvertFlag, e.g. %AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS
483 * @error: A #GError or %NULL
484 *
485 * Converts an XML description into a printable form.
486 *
487 * Returns: (transfer full): a newly allocated %NULL terminated string
488 *
489 * Since: 0.3.5
490 **/
491 gchar *
as_markup_convert_full(const gchar * markup,AsMarkupConvertFormat format,AsMarkupConvertFlag flags,GError ** error)492 as_markup_convert_full (const gchar *markup,
493 AsMarkupConvertFormat format,
494 AsMarkupConvertFlag flags,
495 GError **error)
496 {
497 GNode *tmp;
498 GNode *tmp_c;
499 const gchar *tag;
500 const gchar *tag_c;
501 g_autoptr(AsNode) root = NULL;
502 g_autoptr(GError) error_local = NULL;
503 g_autoptr(GString) str = NULL;
504
505 /* is this actually markup */
506 if (g_strstr_len (markup, -1, "<") == NULL)
507 return g_strdup (markup);
508
509 /* load */
510 root = as_node_from_xml (markup, AS_NODE_FROM_XML_FLAG_NONE, &error_local);
511 if (root == NULL) {
512
513 /* truncate to the last tag and try again */
514 if (flags & AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS) {
515 gchar *found;
516 g_autofree gchar *markup_new = NULL;
517 markup_new = g_strdup (markup);
518 found = g_strrstr (markup_new, "<");
519 g_assert (found != NULL);
520 *found = '\0';
521 return as_markup_convert_full (markup_new, format, flags, error);
522 }
523
524 /* just return error */
525 g_propagate_error (error, error_local);
526 error_local = NULL;
527 return NULL;
528 }
529
530 /* format */
531 str = g_string_new ("");
532 for (tmp = root->children; tmp != NULL; tmp = tmp->next) {
533
534 tag = as_node_get_name (tmp);
535 if (g_strcmp0 (tag, "unknown") == 0)
536 continue;
537 if (g_strcmp0 (tag, "p") == 0) {
538 as_markup_render_para (str, format, as_node_get_data (tmp));
539 continue;
540 }
541
542 /* loop on the children */
543 if (g_strcmp0 (tag, "ul") == 0 ||
544 g_strcmp0 (tag, "ol") == 0) {
545 as_markup_render_ul_start (str, format);
546 for (tmp_c = tmp->children; tmp_c != NULL; tmp_c = tmp_c->next) {
547 tag_c = as_node_get_name (tmp_c);
548 if (g_strcmp0 (tag_c, "unknown") == 0)
549 continue;
550 if (g_strcmp0 (tag_c, "li") == 0) {
551 as_markup_render_li (str, format,
552 as_node_get_data (tmp_c));
553 continue;
554 }
555
556 /* just abort the list */
557 if (flags & AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS)
558 break;
559
560 /* only <li> is valid in lists */
561 g_set_error (error,
562 AS_NODE_ERROR,
563 AS_NODE_ERROR_FAILED,
564 "Tag %s in %s invalid",
565 tag_c, tag);
566 return NULL;
567 }
568 as_markup_render_ul_end (str, format);
569 continue;
570 }
571
572 /* just try again */
573 if (flags & AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS)
574 continue;
575
576 /* only <p>, <ul> and <ol> is valid here */
577 g_set_error (error,
578 AS_NODE_ERROR,
579 AS_NODE_ERROR_FAILED,
580 "Unknown tag '%s'", tag);
581 return NULL;
582 }
583
584 /* success */
585 switch (format) {
586 case AS_MARKUP_CONVERT_FORMAT_SIMPLE:
587 case AS_MARKUP_CONVERT_FORMAT_MARKDOWN:
588 if (str->len > 0)
589 g_string_truncate (str, str->len - 1);
590 break;
591 default:
592 break;
593 }
594 return g_strdup (str->str);
595 }
596
597 /**
598 * as_markup_convert:
599 * @markup: the text to copy.
600 * @format: the #AsMarkupConvertFormat, e.g. %AS_MARKUP_CONVERT_FORMAT_MARKDOWN
601 * @error: A #GError or %NULL
602 *
603 * Converts an XML description into a printable form.
604 *
605 * Returns: (transfer full): a newly allocated %NULL terminated string
606 *
607 * Since: 0.3.5
608 **/
609 gchar *
as_markup_convert(const gchar * markup,AsMarkupConvertFormat format,GError ** error)610 as_markup_convert (const gchar *markup,
611 AsMarkupConvertFormat format, GError **error)
612 {
613 return as_markup_convert_full (markup, format,
614 AS_MARKUP_CONVERT_FLAG_NONE,
615 error);
616 }
617
618 /**
619 * as_markup_convert_simple:
620 * @markup: the text to copy.
621 * @error: A #GError or %NULL
622 *
623 * Converts an XML description into a printable form.
624 *
625 * Returns: (transfer full): a newly allocated %NULL terminated string
626 *
627 * Since: 0.1.0
628 **/
629 gchar *
as_markup_convert_simple(const gchar * markup,GError ** error)630 as_markup_convert_simple (const gchar *markup, GError **error)
631 {
632 return as_markup_convert_full (markup,
633 AS_MARKUP_CONVERT_FORMAT_SIMPLE,
634 AS_MARKUP_CONVERT_FLAG_NONE,
635 error);
636 }
637