1 /* copyright 2013 Sascha Kruse and contributors (see LICENSE for licensing information) */
2 
3 #include "markup.h"
4 
5 #include <assert.h>
6 #include <ctype.h>
7 #include <stdbool.h>
8 #include <stdio.h>
9 #include <string.h>
10 
11 #include "log.h"
12 #include "settings.h"
13 #include "utils.h"
14 
/**
 * Replace the HTML special characters `&`, `"`, `'`, `<` and `>` by their
 * entities.
 *
 * The ampersand is substituted first, so the `&` introduced by the later
 * substitutions is not escaped a second time.
 *
 * @param str (nullable) The string to quote
 * @return The pointer handed back by the last string_replace_all() call
 */
static char *markup_quote(char *str)
{
        /* {needle, entity}, applied top to bottom */
        static const char *const substitutions[][2] = {
                {"&",  "&amp;"},
                {"\"", "&quot;"},
                {"'",  "&apos;"},
                {"<",  "&lt;"},
                {">",  "&gt;"},
        };

        ASSERT_OR_RET(str, NULL);

        for (size_t i = 0; i < sizeof(substitutions) / sizeof(*substitutions); i++)
                str = string_replace_all(substitutions[i][0], substitutions[i][1], str);

        return str;
}
31 
/**
 * Replace the entities `&quot;`, `&apos;`, `&lt;`, `&gt;` and `&amp;` by
 * the character they stand for.
 *
 * `&amp;` is substituted last, so that input such as `&amp;lt;` ends up as
 * the literal text `&lt;` instead of being unescaped twice.
 *
 * @param str (nullable) The string to unquote
 * @return The pointer handed back by the last string_replace_all() call
 */
static char *markup_unquote(char *str)
{
        /* {entity, character}, applied top to bottom */
        static const char *const substitutions[][2] = {
                {"&quot;", "\""},
                {"&apos;", "'"},
                {"&lt;",   "<"},
                {"&gt;",   ">"},
                {"&amp;",  "&"},
        };

        ASSERT_OR_RET(str, NULL);

        for (size_t i = 0; i < sizeof(substitutions) / sizeof(*substitutions); i++)
                str = string_replace_all(substitutions[i][0], substitutions[i][1], str);

        return str;
}
48 
/**
 * Replace the line break tags `<br>`, `<br/>` and `<br />` by a newline
 * character.
 *
 * @param str (nullable) The string to transform
 * @return The pointer handed back by the last string_replace_all() call
 */
static char *markup_br2nl(char *str)
{
        /* the spellings of the tag that are recognised */
        static const char *const br_tags[] = {"<br>", "<br/>", "<br />"};

        ASSERT_OR_RET(str, NULL);

        for (size_t i = 0; i < sizeof(br_tags) / sizeof(*br_tags); i++)
                str = string_replace_all(br_tags[i], "\n", str);

        return str;
}
62 
/**
 * Replace every anchor element in \p *str by its inner text and optionally
 * collect the link targets (see markup.h for the public contract).
 *
 * Each match of `<a` is processed as follows:
 *  - If the opening tag has no `>`, the rest of the string starting at the
 *    tag is removed and processing stops.
 *  - If `</a>` shows up before the `>` of the opening tag, everything up to
 *    and including that `</a>` is removed and processing stops.
 *  - Otherwise the element is replaced by the text between the opening tag
 *    and the first `</a>` (or the end of the string if there is none).
 *    If the opening tag holds a complete `href="..."` attribute and \p urls
 *    is given, an entry of the form `[text] url` (with `[` and `]` removed
 *    from the text) is added to \p *urls through string_append() with "\n"
 *    as the separator argument.
 *
 * @param str  Pointer to the string to work on; \p *str must not be NULL.
 *             \p *str is updated with the result of every
 *             string_replace_at() call.
 * @param urls (nullable) If given, \p *urls is reset to NULL first and then
 *             receives the collected entries.
 */
void markup_strip_a(char **str, char **urls)
{
        assert(*str);
        char *tag1 = NULL;

        if (urls)
                *urls = NULL;

        while ((tag1 = strstr(*str, "<a"))) {
                // use href=" as stated in the notification spec
                char *href = strstr(tag1, "href=\"");
                char *tag1_end = strstr(tag1, ">");
                char *tag2 = strstr(tag1, "</a>");

                // the opening tag is never closed: drop the remainder and stop
                if (!tag1_end) {
                        LOG_W("Given link is broken: '%s'",
                              tag1);
                        // keep the returned pointer, as every other call site does
                        *str = string_replace_at(*str, tag1-*str, strlen(tag1), "");
                        break;
                }
                // the closing tag sits inside the opening tag: drop both and stop
                if (tag2 && tag2 < tag1_end) {
                        int repl_len =  (tag2 - tag1) + strlen("</a>");
                        LOG_W("Given link is broken: '%.*s.'",
                              repl_len, tag1);
                        *str = string_replace_at(*str, tag1-*str, repl_len, "");
                        break;
                }

                // search contents of href attribute
                char *plain_url = NULL;
                if (href && href < tag1_end) {

                        // shift href to the actual begin of the value
                        href = href+6;

                        const char *quote = strstr(href, "\"");

                        // only accept a value that ends inside the opening tag
                        if (quote && quote < tag1_end) {
                                plain_url = g_strndup(href, quote-href);
                        }
                }

                // text between a tags
                int text_len;
                if (tag2)
                        text_len = tag2 - (tag1_end+1);
                else
                        text_len = strlen(tag1_end+1);

                // copied before *str is modified, so it stays valid below
                char *text = g_strndup(tag1_end+1, text_len);

                // opening tag + inner text (+ closing tag, if there is one)
                int repl_len = text_len + (tag1_end-tag1) + 1;
                repl_len += tag2 ? strlen("</a>") : 0;

                *str = string_replace_at(*str, tag1-*str, repl_len, text);

                // if there had been a href attribute,
                // add it to the URLs
                if (plain_url && urls) {
                        // brackets delimit the label in the entry, so drop them from the text
                        text = string_replace_all("]", "", text);
                        text = string_replace_all("[", "", text);

                        char *url = g_strdup_printf("[%s] %s", text, plain_url);

                        *urls = string_append(*urls, url, "\n");
                        g_free(url);
                }

                g_free(plain_url);
                g_free(text);
        }
}
137 
/**
 * Replace every `<img ...>` tag in \p *str by its alt text and optionally
 * collect the image sources (see markup.h for the public contract).
 *
 * Each match of `<img` is processed as follows:
 *  - If the tag has no `>`, the rest of the string starting at the tag is
 *    removed and processing stops.
 *  - Otherwise the tag is replaced by the value of its `alt="..."`
 *    attribute, or by `[image]` if no usable alt attribute is found.
 *    If a usable `src="..."` attribute is found and \p urls is given, an
 *    entry of the form `[alt] src` (with `[` and `]` removed from the alt
 *    text) is added to \p *urls through string_append() with "\n" as the
 *    separator argument.
 *
 * An attribute counts as usable only if its closing quote lies before the
 * `>` of the tag and its value does not overlap the other attribute.
 *
 * @param str  Pointer to the string to work on; \p *str must not be NULL.
 *             \p *str is updated with the result of every
 *             string_replace_at() call.
 * @param urls (nullable) If given, \p *urls is reset to NULL first and then
 *             receives the collected entries.
 */
void markup_strip_img(char **str, char **urls)
{
        // same precondition as markup_strip_a(): strstr() needs a valid string
        assert(*str);

        const char *start;

        if (urls)
                *urls = NULL;

        while ((start = strstr(*str, "<img"))) {
                const char *end = strstr(start, ">");

                // the tag is never closed: drop the remainder and stop
                if (!end) {
                        LOG_W("Given image is broken: '%s'", start);
                        // keep the returned pointer, as every other call site does
                        *str = string_replace_at(*str, start-*str, strlen(start), "");
                        break;
                }

                // use attribute=" as stated in the notification spec
                const char *alt_s = strstr(start, "alt=\"");
                const char *src_s = strstr(start, "src=\"");

                char *text_alt = NULL;
                char *text_src = NULL;

                // locate the closing quote of each attribute value
                const char *src_e = NULL, *alt_e = NULL;
                if (alt_s)
                        alt_e = strstr(alt_s + strlen("alt=\""), "\"");
                if (src_s)
                        src_e = strstr(src_s + strlen("src=\""), "\"");

                // Move pointer to the actual start
                alt_s = alt_s ? alt_s + strlen("alt=\"") : NULL;
                src_s = src_s ? src_s + strlen("src=\"") : NULL;

                /* check if alt and src attribute are given
                 * If both given, check the alignment of all pointers:
                 * the first value has to end before the second attribute
                 * name begins and the second value has to end inside the tag */
                if (   alt_s && alt_e
                    && src_s && src_e
                    && (  (alt_s < src_s && alt_e < src_s-strlen("src=\"") && src_e < end)
                        ||(src_s < alt_s && src_e < alt_s-strlen("alt=\"") && alt_e < end)) ) {

                        text_alt = g_strndup(alt_s, alt_e-alt_s);
                        text_src = g_strndup(src_s, src_e-src_s);

                /* check if single valid alt attribute is available */
                } else if (alt_s && alt_e && alt_e < end && (!src_s || src_s < alt_s || alt_e < src_s - strlen("src=\""))) {
                        text_alt = g_strndup(alt_s, alt_e-alt_s);

                /* check if single valid src attribute is available */
                } else if (src_s && src_e && src_e < end && (!alt_s || alt_s < src_s || src_e < alt_s - strlen("alt=\""))) {
                        text_src = g_strndup(src_s, src_e-src_s);

                } else {
                         LOG_W("Given image argument is broken: '%.*s'",
                               (int)(end-start), start);
                }

                // the whole tag, including the closing '>', gets replaced
                int repl_len = end - start + 1;

                // fall back to a placeholder if there is no alt text
                if (!text_alt)
                        text_alt = g_strdup("[image]");

                *str = string_replace_at(*str, start-*str, repl_len, text_alt);

                // if there had been a src attribute,
                // add it to the URLs
                if (text_src && urls) {
                        // brackets delimit the label in the entry, so drop them from the text
                        text_alt = string_replace_all("]", "", text_alt);
                        text_alt = string_replace_all("[", "", text_alt);

                        char *url = g_strdup_printf("[%s] %s", text_alt, text_src);

                        *urls = string_append(*urls, url, "\n");
                        g_free(url);
                }

                g_free(text_src);
                g_free(text_alt);
        }
}
220 
/**
 * Remove all tags from \p str and unquote the entities that are left
 * (see markup.h for the public contract).
 *
 * @param str (nullable) The string to strip
 * @return The pointer handed back by markup_unquote()
 */
char *markup_strip(char *str)
{
        ASSERT_OR_RET(str, NULL);

        /* Drop everything enclosed in '<' ... '>'. The helper's result is
         * not captured here, so it is expected to edit str in place. */
        string_strip_delimited(str, '<', '>');

        /* what remains is plain text with entities: unquote it */
        return markup_unquote(str);
}
234 
/**
 * Determine if an & character pointed to by \p str is a markup & entity or
 * part of the text
 *
 * Accepted are:
 *  - decimal character references such as `&#1234;`
 *  - hexadecimal character references such as `&#xABC;` (lowercase `x` only)
 *  - the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;`
 *
 * @param str Pointer to the `&` to inspect; must not be NULL and must point
 *            to a `&` character (both are asserted)
 *
 * @retval true: \p str is an entity
 * @retval false: It's no valid entity
 */
static bool markup_is_entity(const char *str)
{
        assert(str);
        assert(*str == '&');

        // without a terminating ';' somewhere it cannot be an entity
        const char *end = strchr(str, ';');
        ASSERT_OR_RET(end, false);

        // Parse (hexa)decimal entities with the format &#1234; or &#xABC;
        if (str[1] == '#') {
                const char *cur = str + 2;

                if (*cur == 'x') {
                        cur++;

                        // Reject &#x;
                        if (*cur == ';')
                                return false;

                        // <ctype.h> functions require a value representable
                        // as unsigned char, hence the cast
                        while (cur < end && isxdigit((unsigned char)*cur))
                                cur++;
                } else {

                        // Reject &#;
                        if (*cur == ';')
                                return false;

                        while (cur < end && isdigit((unsigned char)*cur))
                                cur++;
                }

                // valid only if nothing but digits preceded the ';'
                return (cur == end);
        } else {
                static const char *const supported_tags[] = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
                for (size_t i = 0; i < sizeof(supported_tags)/sizeof(*supported_tags); i++) {
                        if (g_str_has_prefix(str, supported_tags[i]))
                                return true;
                }
                return false;
        }
}
283 
/**
 * Turn every `&` that does not start an entity accepted by
 * markup_is_entity() into `&amp;`.
 *
 * The string is passed through string_replace_at() for every replacement,
 * so the returned pointer has to be used instead of \p str afterwards.
 *
 * @param str (nullable) The string to be transformed
 * @return The transformed string
 */
static char *markup_escape_unsupported(char *str)
{
        ASSERT_OR_RET(str, NULL);

        char *amp = strchr(str, '&');
        while (amp) {
                char *resume;

                if (markup_is_entity(amp)) {
                        // a valid entity stays untouched, continue behind its '&'
                        resume = amp + 1;
                } else {
                        // remember the offset: str may be a different buffer afterwards
                        int offset = amp - str;
                        str = string_replace_at(str, offset, 1, "&amp;");
                        // continue behind the entity that was just inserted
                        resume = str + offset + strlen("&amp;");
                }

                amp = strchr(resume, '&');
        }

        return str;
}
307 
308 /* see markup.h */
markup_transform(char * str,enum markup_mode markup_mode)309 char *markup_transform(char *str, enum markup_mode markup_mode)
310 {
311         ASSERT_OR_RET(str, NULL);
312 
313         switch (markup_mode) {
314         case MARKUP_NULL:
315                 /* `assert(false)`, but with a meaningful error message */
316                 assert(markup_mode != MARKUP_NULL);
317                 break;
318         case MARKUP_NO:
319                 str = markup_quote(str);
320                 break;
321         case MARKUP_STRIP:
322                 str = markup_br2nl(str);
323                 str = markup_strip(str);
324                 str = markup_quote(str);
325                 break;
326         case MARKUP_FULL:
327                 str = markup_escape_unsupported(str);
328                 str = markup_br2nl(str);
329                 markup_strip_a(&str, NULL);
330                 markup_strip_img(&str, NULL);
331                 break;
332         }
333 
334         if (settings.ignore_newline) {
335                 str = string_replace_all("\n", " ", str);
336         }
337 
338         return str;
339 }
340 
341 /* vim: set ft=c tabstop=8 shiftwidth=8 expandtab textwidth=0: */
342