1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
2  *
3  * Copyright (C) 2014-2016 Richard Hughes <richard@hughsie.com>
4  *
5  * SPDX-License-Identifier: LGPL-2.1+
6  */
7 
8 /**
9  * SECTION:as-markup
10  * @short_description: Functions for managing AppStream description markup
11  * @include: appstream-glib.h
12  * @stability: Stable
13  *
14  * These functions are used internally to libappstream-glib, and some may be
15  * useful to user-applications.
16  */
17 
18 #include "config.h"
19 
20 #include <string.h>
21 
22 #include "as-markup.h"
23 #include "as-node.h"
24 #include "as-utils.h"
25 
/* Parser state of the HTML importer: which kind of element the text that is
 * currently being buffered belongs to. */
typedef enum {
	AS_MARKUP_TAG_UNKNOWN,	/* text is ignored and nothing is flushed */
	AS_MARKUP_TAG_PARA,	/* buffered text is flushed as <p> elements */
	AS_MARKUP_TAG_OL,	/* not referenced by the visible code */
	AS_MARKUP_TAG_UL,	/* a list was opened but no <li> seen yet */
	AS_MARKUP_TAG_LI,	/* buffered text is flushed as <li> elements */
	AS_MARKUP_TAG_LAST
} AsMarkupTag;
34 
/* State passed as user_data to the GMarkup callbacks of the HTML importer. */
typedef struct {
	AsMarkupTag	 action;	/* current parser state */
	GString		*output;	/* markup generated so far */
	GString		*temp;		/* text collected since the last flush */
} AsMarkupImportHelper;
40 
41 static void
as_markup_import_html_flush(AsMarkupImportHelper * helper)42 as_markup_import_html_flush (AsMarkupImportHelper *helper)
43 {
44 	gchar *tmp;
45 	guint i;
46 	g_auto(GStrv) split = NULL;
47 
48 	/* trivial case */
49 	if (helper->action == AS_MARKUP_TAG_UNKNOWN)
50 		return;
51 	if (helper->temp->len == 0)
52 		return;
53 
54 	/* split into lines and strip */
55 	split = g_strsplit (helper->temp->str, "\n", -1);
56 	for (i = 0; split[i] != NULL; i++) {
57 		tmp = g_strstrip (split[i]);
58 		if (tmp[0] == '\0')
59 			continue;
60 		switch (helper->action) {
61 		case AS_MARKUP_TAG_PARA:
62 			g_string_append_printf (helper->output, "<p>%s</p>", tmp);
63 			break;
64 		case AS_MARKUP_TAG_LI:
65 			g_string_append_printf (helper->output, "<li>%s</li>", tmp);
66 			break;
67 		default:
68 			break;
69 		}
70 	}
71 	g_string_truncate (helper->temp, 0);
72 }
73 
74 static void
as_markup_import_html_set_tag(AsMarkupImportHelper * helper,AsMarkupTag action_new)75 as_markup_import_html_set_tag (AsMarkupImportHelper *helper, AsMarkupTag action_new)
76 {
77 	if (helper->action == AS_MARKUP_TAG_UL &&
78 	    action_new == AS_MARKUP_TAG_LI) {
79 		g_string_append (helper->output, "<ul>");
80 		helper->action = action_new;
81 	} else if (helper->action == AS_MARKUP_TAG_UL &&
82 		   action_new == AS_MARKUP_TAG_UNKNOWN) {
83 		g_string_append (helper->output, "</ul>");
84 		helper->action = action_new;
85 	} else {
86 		helper->action = action_new;
87 	}
88 }
89 
90 static void
as_markup_import_html_start_cb(GMarkupParseContext * context,const gchar * element_name,const gchar ** attribute_names,const gchar ** attribute_values,gpointer user_data,GError ** error)91 as_markup_import_html_start_cb (GMarkupParseContext *context,
92 				const gchar *element_name,
93 				const gchar **attribute_names,
94 				const gchar **attribute_values,
95 				gpointer user_data,
96 				GError **error)
97 {
98 	AsMarkupImportHelper *helper = (AsMarkupImportHelper *) user_data;
99 
100 	/* don't assume the document starts with <p> */
101 	if (g_strcmp0 (element_name, "document") == 0 ||
102 	    g_strcmp0 (element_name, "p") == 0) {
103 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_PARA);
104 		return;
105 	}
106 	if (g_strcmp0 (element_name, "li") == 0) {
107 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_LI);
108 		return;
109 	}
110 	if (g_strcmp0 (element_name, "ul") == 0) {
111 		as_markup_import_html_flush (helper);
112 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UL);
113 		return;
114 	}
115 
116 	/* do not include the contents of these tags */
117 	if (g_strcmp0 (element_name, "h1") == 0 ||
118 	    g_strcmp0 (element_name, "h2") == 0) {
119 		as_markup_import_html_flush (helper);
120 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UNKNOWN);
121 		return;
122 	}
123 }
124 
125 static void
as_markup_import_html_end_cb(GMarkupParseContext * context,const gchar * element_name,gpointer user_data,GError ** error)126 as_markup_import_html_end_cb (GMarkupParseContext *context,
127 			      const gchar *element_name,
128 			      gpointer user_data,
129 			      GError **error)
130 {
131 	AsMarkupImportHelper *helper = (AsMarkupImportHelper *) user_data;
132 
133 	if (g_strcmp0 (element_name, "document") == 0 ||
134 	    g_strcmp0 (element_name, "p") == 0) {
135 		as_markup_import_html_flush (helper);
136 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UNKNOWN);
137 		return;
138 	}
139 	/* don't assume the next section starts with <p> */
140 	if (g_strcmp0 (element_name, "h1") == 0 ||
141 	    g_strcmp0 (element_name, "h2") == 0) {
142 		as_markup_import_html_flush (helper);
143 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_PARA);
144 		return;
145 	}
146 	if (g_strcmp0 (element_name, "li") == 0) {
147 		as_markup_import_html_flush (helper);
148 		/* not UL, else we do a new <ul> on next <li> */
149 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_LI);
150 		return;
151 	}
152 	if (g_strcmp0 (element_name, "ul") == 0 ||
153 	    g_strcmp0 (element_name, "ol") == 0) {
154 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UL);
155 		as_markup_import_html_set_tag (helper, AS_MARKUP_TAG_UNKNOWN);
156 		return;
157 	}
158 }
159 
160 static void
as_markup_import_html_text_cb(GMarkupParseContext * context,const gchar * text,gsize text_len,gpointer user_data,GError ** error)161 as_markup_import_html_text_cb (GMarkupParseContext *context,
162 			       const gchar *text,
163 			       gsize text_len,
164 			       gpointer user_data,
165 			       GError **error)
166 {
167 	AsMarkupImportHelper *helper = (AsMarkupImportHelper *) user_data;
168 	g_autofree gchar *tmp = NULL;
169 
170 	if (helper->action == AS_MARKUP_TAG_UNKNOWN)
171 		return;
172 
173 	/* do not assume text is NULL-terminated */
174 	tmp = g_strndup (text, text_len);
175 	g_string_append (helper->temp, tmp);
176 }
177 
178 static void
as_markup_import_html_erase(GString * str,const gchar * start,const gchar * end)179 as_markup_import_html_erase (GString *str, const gchar *start, const gchar *end)
180 {
181 	gssize start_len = (gssize) strlen (start);
182 	gssize end_len = (gssize) strlen (end);
183 	if (start_len + end_len > str->len)
184 		return;
185 	for (gssize i = 0; i < str->len - start_len; i++) {
186 		if (memcmp (&str->str[i], start, (gsize) start_len) != 0)
187 			continue;
188 		for (gssize j = i; i < (gssize) str->len; j++) {
189 			if (memcmp (&str->str[j], end, (gsize) end_len) != 0)
190 				continue;
191 			/* delete this section and restart the search */
192 			g_string_erase (str, i, (j - i) + end_len);
193 			i = -1;
194 			break;
195 		}
196 	}
197 }
198 
199 static gchar *
as_markup_import_html(const gchar * text,GError ** error)200 as_markup_import_html (const gchar *text, GError **error)
201 {
202 	AsMarkupImportHelper helper;
203 	GMarkupParser parser = {
204 		as_markup_import_html_start_cb,
205 		as_markup_import_html_end_cb,
206 		as_markup_import_html_text_cb,
207 		NULL,
208 		NULL };
209 	g_autoptr(GMarkupParseContext) ctx = NULL;
210 	g_autoptr(GString) str = NULL;
211 	g_autoptr(GString) helper_output = NULL;
212 	g_autoptr(GString) helper_temp = NULL;
213 
214 	/* clean these up on failure */
215 	helper_output = g_string_new ("");
216 	helper_temp = g_string_new ("");
217 
218 	/* set up XML parser */
219 	helper.action = AS_MARKUP_TAG_UNKNOWN;
220 	helper.output = helper_output;
221 	helper.temp = helper_temp;
222 	ctx = g_markup_parse_context_new (&parser,
223 					  G_MARKUP_TREAT_CDATA_AS_TEXT,
224 					  &helper,
225 					  NULL);
226 
227 	/* ensure this has at least one set of quotes */
228 	str = g_string_new ("");
229 	g_string_append_printf (str, "<document>%s</document>", text);
230 
231 	/* convert win32 line endings */
232 	g_strdelimit (str->str, "\r", '\n');
233 
234 	/* treat as paragraph breaks */
235 	as_utils_string_replace (str, "<br>", "\n");
236 
237 	/* tidy up non-compliant HTML5 */
238 	as_markup_import_html_erase (str, "<img", ">");
239 	as_markup_import_html_erase (str, "<link", ">");
240 	as_markup_import_html_erase (str, "<meta", ">");
241 
242 	/* use UTF-8 */
243 	as_utils_string_replace (str, "&trade;", "™");
244 	as_utils_string_replace (str, "&reg;", "®");
245 	as_utils_string_replace (str, "&nbsp;", " ");
246 
247 	/* parse */
248 	if (!g_markup_parse_context_parse (ctx, str->str, -1, error))
249 		return NULL;
250 
251 	/* return only valid AppStream markup */
252 	return as_markup_convert_full (helper.output->str,
253 				       AS_MARKUP_CONVERT_FORMAT_APPSTREAM,
254 				       AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS,
255 				       error);
256 }
257 
258 static gchar *
as_markup_import_simple(const gchar * text,GError ** error)259 as_markup_import_simple (const gchar *text, GError **error)
260 {
261 	GString *str;
262 	guint i;
263 	g_auto(GStrv) lines = NULL;
264 
265 	/* empty */
266 	if (text == NULL || text[0] == '\0')
267 		return NULL;
268 
269 	/* just assume paragraphs */
270 	str = g_string_new ("<p>");
271 	lines = g_strsplit (text, "\n", -1);
272 	for (i = 0; lines[i] != NULL; i++) {
273 		g_autofree gchar *markup = NULL;
274 		if (lines[i][0] == '\0') {
275 			if (g_str_has_suffix (str->str, " "))
276 				g_string_truncate (str, str->len - 1);
277 			g_string_append (str, "</p><p>");
278 			continue;
279 		}
280 		markup = g_markup_escape_text (lines[i], -1);
281 		g_string_append (str, markup);
282 		g_string_append (str, " ");
283 	}
284 	if (g_str_has_suffix (str->str, " "))
285 		g_string_truncate (str, str->len - 1);
286 	g_string_append (str, "</p>");
287 	return g_string_free (str, FALSE);
288 }
289 
290 /**
291  * as_markup_import:
292  * @text: the text to import.
293  * @format: the #AsMarkupConvertFormat, e.g. %AS_MARKUP_CONVERT_FORMAT_SIMPLE
294  * @error: A #GError or %NULL
295  *
296  * Imports text and converts to AppStream markup.
297  *
298  * Returns: (transfer full): appstream markup, or %NULL in event of an error
299  *
300  * Since: 0.5.11
301  */
302 gchar *
as_markup_import(const gchar * text,AsMarkupConvertFormat format,GError ** error)303 as_markup_import (const gchar *text, AsMarkupConvertFormat format, GError **error)
304 {
305 	if (format == AS_MARKUP_CONVERT_FORMAT_SIMPLE)
306 		return as_markup_import_simple (text, error);
307 	if (format == AS_MARKUP_CONVERT_FORMAT_HTML)
308 		return as_markup_import_html (text, error);
309 	g_set_error_literal (error,
310 			     AS_UTILS_ERROR,
311 			     AS_UTILS_ERROR_INVALID_TYPE,
312 			     "unknown comnversion kind");
313 	return NULL;
314 }
315 
316 /**
317  * as_markup_strsplit_words:
318  * @text: the text to split.
319  * @line_len: the maximum length of the output line
320  *
321  * Splits up a long line into an array of smaller strings, each being no longer
322  * than @line_len. Words are not split.
323  *
324  * Returns: (transfer full): lines, or %NULL in event of an error
325  *
326  * Since: 0.3.5
327  **/
328 gchar **
as_markup_strsplit_words(const gchar * text,guint line_len)329 as_markup_strsplit_words (const gchar *text, guint line_len)
330 {
331 	GPtrArray *lines;
332 	guint i;
333 	g_autoptr(GString) curline = NULL;
334 	g_auto(GStrv) tokens = NULL;
335 
336 	/* sanity check */
337 	if (text == NULL || text[0] == '\0')
338 		return NULL;
339 	if (line_len == 0)
340 		return NULL;
341 
342 	lines = g_ptr_array_new ();
343 	curline = g_string_new ("");
344 
345 	/* tokenize the string */
346 	tokens = g_strsplit (text, " ", -1);
347 	for (i = 0; tokens[i] != NULL; i++) {
348 
349 		/* current line plus new token is okay */
350 		if (curline->len + strlen (tokens[i]) < line_len) {
351 			g_string_append_printf (curline, "%s ", tokens[i]);
352 			continue;
353 		}
354 
355 		/* too long, so remove space, add newline and dump */
356 		if (curline->len > 0)
357 			g_string_truncate (curline, curline->len - 1);
358 		g_string_append (curline, "\n");
359 		g_ptr_array_add (lines, g_strdup (curline->str));
360 		g_string_truncate (curline, 0);
361 		g_string_append_printf (curline, "%s ", tokens[i]);
362 
363 	}
364 
365 	/* any incomplete line? */
366 	if (curline->len > 0) {
367 		g_string_truncate (curline, curline->len - 1);
368 		g_string_append (curline, "\n");
369 		g_ptr_array_add (lines, g_strdup (curline->str));
370 	}
371 
372 	g_ptr_array_add (lines, NULL);
373 	return (gchar **) g_ptr_array_free (lines, FALSE);
374 }
375 
376 static void
as_markup_render_para(GString * str,AsMarkupConvertFormat format,const gchar * data)377 as_markup_render_para (GString *str, AsMarkupConvertFormat format, const gchar *data)
378 {
379 	guint i;
380 	g_autofree gchar *tmp = NULL;
381 	g_auto(GStrv) spl = NULL;
382 
383 	/* ignore <p></p> */
384 	if (data == NULL)
385 		return;
386 
387 	if (str->len > 0)
388 		g_string_append (str, "\n");
389 	switch (format) {
390 	case AS_MARKUP_CONVERT_FORMAT_SIMPLE:
391 		g_string_append_printf (str, "%s\n", data);
392 		break;
393 	case AS_MARKUP_CONVERT_FORMAT_APPSTREAM:
394 		tmp = g_markup_escape_text (data, -1);
395 		g_string_append_printf (str, "<p>%s</p>", tmp);
396 		break;
397 	case AS_MARKUP_CONVERT_FORMAT_MARKDOWN:
398 		/* break to 80 chars */
399 		spl = as_markup_strsplit_words (data, 80);
400 		for (i = 0; spl[i] != NULL; i++)
401 			g_string_append (str, spl[i]);
402 		break;
403 	default:
404 		break;
405 	}
406 }
407 
408 static void
as_markup_render_li(GString * str,AsMarkupConvertFormat format,const gchar * data)409 as_markup_render_li (GString *str, AsMarkupConvertFormat format, const gchar *data)
410 {
411 	guint i;
412 	g_autofree gchar *tmp = NULL;
413 	g_auto(GStrv) spl = NULL;
414 
415 	switch (format) {
416 	case AS_MARKUP_CONVERT_FORMAT_SIMPLE:
417 		g_string_append_printf (str, " • %s\n", data);
418 		break;
419 	case AS_MARKUP_CONVERT_FORMAT_APPSTREAM:
420 		tmp = g_markup_escape_text (data, -1);
421 		g_string_append_printf (str, "<li>%s</li>", tmp);
422 		break;
423 	case AS_MARKUP_CONVERT_FORMAT_MARKDOWN:
424 		/* break to 80 chars, leaving room for the dot/indent */
425 		spl = as_markup_strsplit_words (data, 80 - 3);
426 		g_string_append_printf (str, " * %s", spl[0]);
427 		for (i = 1; spl[i] != NULL; i++)
428 			g_string_append_printf (str, "   %s", spl[i]);
429 		break;
430 	default:
431 		break;
432 	}
433 }
434 
435 static void
as_markup_render_ul_start(GString * str,AsMarkupConvertFormat format)436 as_markup_render_ul_start (GString *str, AsMarkupConvertFormat format)
437 {
438 	switch (format) {
439 	case AS_MARKUP_CONVERT_FORMAT_APPSTREAM:
440 		g_string_append (str, "<ul>");
441 		break;
442 	default:
443 		break;
444 	}
445 }
446 
447 static void
as_markup_render_ul_end(GString * str,AsMarkupConvertFormat format)448 as_markup_render_ul_end (GString *str, AsMarkupConvertFormat format)
449 {
450 	switch (format) {
451 	case AS_MARKUP_CONVERT_FORMAT_APPSTREAM:
452 		g_string_append (str, "</ul>");
453 		break;
454 	default:
455 		break;
456 	}
457 }
458 
459 /**
460  * as_markup_validate:
461  * @markup: the text to validate
462  * @error: A #GError or %NULL
463  *
464  * Validates some markup.
465  *
466  * Returns: %TRUE if the appstream description was valid
467  *
468  * Since: 0.5.1
469  **/
470 gboolean
as_markup_validate(const gchar * markup,GError ** error)471 as_markup_validate (const gchar *markup, GError **error)
472 {
473 	g_autofree gchar *tmp = NULL;
474 	tmp = as_markup_convert (markup, AS_MARKUP_CONVERT_FORMAT_NULL, error);
475 	return tmp != NULL;
476 }
477 
478 /**
479  * as_markup_convert_full:
480  * @markup: the text to copy.
481  * @format: the #AsMarkupConvertFormat, e.g. %AS_MARKUP_CONVERT_FORMAT_MARKDOWN
482  * @flags: the #AsMarkupConvertFlag, e.g. %AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS
483  * @error: A #GError or %NULL
484  *
485  * Converts an XML description into a printable form.
486  *
487  * Returns: (transfer full): a newly allocated %NULL terminated string
488  *
489  * Since: 0.3.5
490  **/
491 gchar *
as_markup_convert_full(const gchar * markup,AsMarkupConvertFormat format,AsMarkupConvertFlag flags,GError ** error)492 as_markup_convert_full (const gchar *markup,
493 			AsMarkupConvertFormat format,
494 			AsMarkupConvertFlag flags,
495 			GError **error)
496 {
497 	GNode *tmp;
498 	GNode *tmp_c;
499 	const gchar *tag;
500 	const gchar *tag_c;
501 	g_autoptr(AsNode) root = NULL;
502 	g_autoptr(GError) error_local = NULL;
503 	g_autoptr(GString) str = NULL;
504 
505 	/* is this actually markup */
506 	if (g_strstr_len (markup, -1, "<") == NULL)
507 		return g_strdup (markup);
508 
509 	/* load */
510 	root = as_node_from_xml (markup, AS_NODE_FROM_XML_FLAG_NONE, &error_local);
511 	if (root == NULL) {
512 
513 		/* truncate to the last tag and try again */
514 		if (flags & AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS) {
515 			gchar *found;
516 			g_autofree gchar *markup_new = NULL;
517 			markup_new = g_strdup (markup);
518 			found = g_strrstr (markup_new, "<");
519 			g_assert (found != NULL);
520 			*found = '\0';
521 			return as_markup_convert_full (markup_new, format, flags, error);
522 		}
523 
524 		/* just return error */
525 		g_propagate_error (error, error_local);
526 		error_local = NULL;
527 		return NULL;
528 	}
529 
530 	/* format */
531 	str = g_string_new ("");
532 	for (tmp = root->children; tmp != NULL; tmp = tmp->next) {
533 
534 		tag = as_node_get_name (tmp);
535 		if (g_strcmp0 (tag, "unknown") == 0)
536 			continue;
537 		if (g_strcmp0 (tag, "p") == 0) {
538 			as_markup_render_para (str, format, as_node_get_data (tmp));
539 			continue;
540 		}
541 
542 		/* loop on the children */
543 		if (g_strcmp0 (tag, "ul") == 0 ||
544 		    g_strcmp0 (tag, "ol") == 0) {
545 			as_markup_render_ul_start (str, format);
546 			for (tmp_c = tmp->children; tmp_c != NULL; tmp_c = tmp_c->next) {
547 				tag_c = as_node_get_name (tmp_c);
548 				if (g_strcmp0 (tag_c, "unknown") == 0)
549 					continue;
550 				if (g_strcmp0 (tag_c, "li") == 0) {
551 					as_markup_render_li (str, format,
552 							     as_node_get_data (tmp_c));
553 					continue;
554 				}
555 
556 				/* just abort the list */
557 				if (flags & AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS)
558 					break;
559 
560 				/* only <li> is valid in lists */
561 				g_set_error (error,
562 					     AS_NODE_ERROR,
563 					     AS_NODE_ERROR_FAILED,
564 					     "Tag %s in %s invalid",
565 					     tag_c, tag);
566 				return NULL;
567 			}
568 			as_markup_render_ul_end (str, format);
569 			continue;
570 		}
571 
572 		/* just try again */
573 		if (flags & AS_MARKUP_CONVERT_FLAG_IGNORE_ERRORS)
574 			continue;
575 
576 		/* only <p>, <ul> and <ol> is valid here */
577 		g_set_error (error,
578 			     AS_NODE_ERROR,
579 			     AS_NODE_ERROR_FAILED,
580 			     "Unknown tag '%s'", tag);
581 		return NULL;
582 	}
583 
584 	/* success */
585 	switch (format) {
586 	case AS_MARKUP_CONVERT_FORMAT_SIMPLE:
587 	case AS_MARKUP_CONVERT_FORMAT_MARKDOWN:
588 		if (str->len > 0)
589 			g_string_truncate (str, str->len - 1);
590 		break;
591 	default:
592 		break;
593 	}
594 	return g_strdup (str->str);
595 }
596 
597 /**
598  * as_markup_convert:
599  * @markup: the text to copy.
600  * @format: the #AsMarkupConvertFormat, e.g. %AS_MARKUP_CONVERT_FORMAT_MARKDOWN
601  * @error: A #GError or %NULL
602  *
603  * Converts an XML description into a printable form.
604  *
605  * Returns: (transfer full): a newly allocated %NULL terminated string
606  *
607  * Since: 0.3.5
608  **/
609 gchar *
as_markup_convert(const gchar * markup,AsMarkupConvertFormat format,GError ** error)610 as_markup_convert (const gchar *markup,
611 		   AsMarkupConvertFormat format, GError **error)
612 {
613 	return as_markup_convert_full (markup, format,
614 				       AS_MARKUP_CONVERT_FLAG_NONE,
615 				       error);
616 }
617 
618 /**
619  * as_markup_convert_simple:
620  * @markup: the text to copy.
621  * @error: A #GError or %NULL
622  *
623  * Converts an XML description into a printable form.
624  *
625  * Returns: (transfer full): a newly allocated %NULL terminated string
626  *
627  * Since: 0.1.0
628  **/
629 gchar *
as_markup_convert_simple(const gchar * markup,GError ** error)630 as_markup_convert_simple (const gchar *markup, GError **error)
631 {
632 	return as_markup_convert_full (markup,
633 				       AS_MARKUP_CONVERT_FORMAT_SIMPLE,
634 				       AS_MARKUP_CONVERT_FLAG_NONE,
635 				       error);
636 }
637