1 /* copyright 2013 Sascha Kruse and contributors (see LICENSE for licensing information) */
2
3 #include "markup.h"
4
5 #include <assert.h>
6 #include <ctype.h>
7 #include <stdbool.h>
8 #include <stdio.h>
9 #include <string.h>
10
11 #include "log.h"
12 #include "settings.h"
13 #include "utils.h"
14
/**
 * Convert all HTML special symbols to HTML entities.
 *
 * The ampersand is replaced first. Doing it later would escape the `&` of
 * the entities inserted by the other replacements a second time.
 *
 * @param str (nullable) The string to escape. It is handed to
 *            string_replace_all(), so callers have to continue with the
 *            returned pointer instead of \p str.
 *
 * @returns The string with `&`, `"`, `'`, `<` and `>` replaced by their
 *          entities
 */
static char *markup_quote(char *str)
{
        ASSERT_OR_RET(str, NULL);

        str = string_replace_all("&", "&amp;", str);
        str = string_replace_all("\"", "&quot;", str);
        str = string_replace_all("'", "&apos;", str);
        str = string_replace_all("<", "&lt;", str);
        str = string_replace_all(">", "&gt;", str);

        return str;
}
31
/**
 * Convert all HTML special entities to their actual char.
 *
 * `&amp;` is deliberately handled last: a doubly escaped sequence such as
 * `&amp;lt;` must end up as the text `&lt;` and not as `<`.
 *
 * @param str (nullable) The string to unescape. It is handed to
 *            string_replace_all(), so callers have to continue with the
 *            returned pointer instead of \p str.
 *
 * @returns The string with `&quot;`, `&apos;`, `&lt;`, `&gt;` and `&amp;`
 *          replaced by the characters they stand for
 */
static char *markup_unquote(char *str)
{
        ASSERT_OR_RET(str, NULL);

        str = string_replace_all("&quot;", "\"", str);
        str = string_replace_all("&apos;", "'", str);
        str = string_replace_all("&lt;", "<", str);
        str = string_replace_all("&gt;", ">", str);
        str = string_replace_all("&amp;", "&", str);

        return str;
}
48
/**
 * Replace every HTML line break tag with a newline character.
 *
 * The spellings `<br>`, `<br/>` and `<br />` are recognised and processed in
 * that order.
 *
 * @param str (nullable) The string to transform; continue with the returned
 *            pointer afterwards
 */
static char *markup_br2nl(char *str)
{
        ASSERT_OR_RET(str, NULL);

        const char *const linebreaks[] = { "<br>", "<br/>", "<br />" };
        const size_t count = sizeof(linebreaks) / sizeof(*linebreaks);

        for (size_t i = 0; i < count; i++) {
                str = string_replace_all(linebreaks[i], "\n", str);
        }

        return str;
}
62
/*
 * see markup.h
 *
 * Replaces every `<a ...>text</a>` found in \p *str by its plain `text`.
 * When \p urls is given, \p *urls is first reset to NULL and then receives
 * one `[text] url` entry for every link that carries a well-formed
 * `href="..."` attribute (entries are joined through string_append() with
 * "\n" as separator argument).
 *
 * A broken tag is cut out of the string and processing stops there.
 *
 * NOTE(review): the search needle is "<a", so other tags starting with these
 * two characters are matched as well - confirm this is intended.
 */
void markup_strip_a(char **str, char **urls)
{
        assert(str);
        assert(*str);
        char *tag1 = NULL;

        if (urls)
                *urls = NULL;

        while ((tag1 = strstr(*str, "<a"))) {
                // use href=" as stated in the notification spec
                char *href = strstr(tag1, "href=\"");
                char *tag1_end = strstr(tag1, ">");
                char *tag2 = strstr(tag1, "</a>");

                // the opening tag is never closed: drop the remainder
                if (!tag1_end) {
                        LOG_W("Given link is broken: '%s'",
                              tag1);
                        // keep the returned pointer, exactly like the
                        // regular replacement further down does
                        *str = string_replace_at(*str, tag1-*str, strlen(tag1), "");
                        break;
                }
                // the closing tag starts inside the opening tag: drop both
                if (tag2 && tag2 < tag1_end) {
                        int repl_len = (tag2 - tag1) + strlen("</a>");
                        LOG_W("Given link is broken: '%.*s.'",
                              repl_len, tag1);
                        *str = string_replace_at(*str, tag1-*str, repl_len, "");
                        break;
                }

                // search contents of href attribute
                char *plain_url = NULL;
                if (href && href < tag1_end) {

                        // shift href to the actual begin of the value
                        href = href+6;

                        const char *quote = strstr(href, "\"");

                        // only accept a value that is terminated inside the tag
                        if (quote && quote < tag1_end) {
                                plain_url = g_strndup(href, quote-href);
                        }
                }

                // text between a tags; without a closing tag the text
                // extends to the end of the string
                int text_len;
                if (tag2)
                        text_len = tag2 - (tag1_end+1);
                else
                        text_len = strlen(tag1_end+1);

                char *text = g_strndup(tag1_end+1, text_len);

                // opening tag + text (+ closing tag, if there is one)
                int repl_len = text_len + (tag1_end-tag1) + 1;
                repl_len += tag2 ? strlen("</a>") : 0;

                *str = string_replace_at(*str, tag1-*str, repl_len, text);

                // if there had been a href attribute,
                // add it to the URLs
                if (plain_url && urls) {
                        // brackets delimit the text in the entry, remove them
                        text = string_replace_all("]", "", text);
                        text = string_replace_all("[", "", text);

                        char *url = g_strdup_printf("[%s] %s", text, plain_url);

                        *urls = string_append(*urls, url, "\n");
                        g_free(url);
                }

                g_free(plain_url);
                g_free(text);
        }
}
137
/*
 * see markup.h
 *
 * Replaces every `<img ...>` tag found in \p *str by the value of its
 * `alt="..."` attribute, or by the text "[image]" when no usable alt
 * attribute is present. When \p urls is given, \p *urls is first reset to
 * NULL and then receives one `[alt] src` entry for every image that carries
 * a well-formed `src="..."` attribute (entries are joined through
 * string_append() with "\n" as separator argument).
 *
 * A tag without closing '>' is cut out of the string and processing stops
 * there.
 */
void markup_strip_img(char **str, char **urls)
{
        assert(str);
        assert(*str);
        const char *start;

        if (urls)
                *urls = NULL;

        while ((start = strstr(*str, "<img"))) {
                const char *end = strstr(start, ">");

                // the tag is broken, ignore it
                if (!end) {
                        LOG_W("Given image is broken: '%s'", start);
                        // keep the returned pointer, exactly like the
                        // regular replacement further down does
                        *str = string_replace_at(*str, start-*str, strlen(start), "");
                        break;
                }

                // use attribute=" as stated in the notification spec
                // (both searches run to the end of the string, the checks
                // below make sure the hits belong to this tag)
                const char *alt_s = strstr(start, "alt=\"");
                const char *src_s = strstr(start, "src=\"");

                char *text_alt = NULL;
                char *text_src = NULL;

                const char *src_e = NULL, *alt_e = NULL;
                if (alt_s)
                        alt_e = strstr(alt_s + strlen("alt=\""), "\"");
                if (src_s)
                        src_e = strstr(src_s + strlen("src=\""), "\"");

                // Move pointer to the actual start
                alt_s = alt_s ? alt_s + strlen("alt=\"") : NULL;
                src_s = src_s ? src_s + strlen("src=\"") : NULL;

                /* check if alt and src attribute are given
                 * If both given, check the alignment of all pointers */
                if (   alt_s && alt_e
                    && src_s && src_e
                    && (  (alt_s < src_s && alt_e < src_s-strlen("src=\"") && src_e < end)
                        ||(src_s < alt_s && src_e < alt_s-strlen("alt=\"") && alt_e < end)) ) {

                        text_alt = g_strndup(alt_s, alt_e-alt_s);
                        text_src = g_strndup(src_s, src_e-src_s);

                /* check if single valid alt attribute is available */
                } else if (alt_s && alt_e && alt_e < end && (!src_s || src_s < alt_s || alt_e < src_s - strlen("src=\""))) {
                        text_alt = g_strndup(alt_s, alt_e-alt_s);

                /* check if single valid src attribute is available */
                } else if (src_s && src_e && src_e < end && (!alt_s || alt_s < src_s || src_e < alt_s - strlen("alt=\""))) {
                        text_src = g_strndup(src_s, src_e-src_s);

                } else {
                         LOG_W("Given image argument is broken: '%.*s'",
                               (int)(end-start), start);
                }

                // the whole tag, including the closing '>', gets replaced
                int repl_len = end - start + 1;

                // fallback text if there is no usable alt attribute
                if (!text_alt)
                        text_alt = g_strdup("[image]");

                *str = string_replace_at(*str, start-*str, repl_len, text_alt);

                // if there had been a src attribute,
                // add it to the URLs
                if (text_src && urls) {
                        // brackets delimit the text in the entry, remove them
                        text_alt = string_replace_all("]", "", text_alt);
                        text_alt = string_replace_all("[", "", text_alt);

                        char *url = g_strdup_printf("[%s] %s", text_alt, text_src);

                        *urls = string_append(*urls, url, "\n");
                        g_free(url);
                }

                g_free(text_src);
                g_free(text_alt);
        }
}
220
/*
 * see markup.h
 *
 * Removes everything enclosed in '<' ... '>' and afterwards converts the
 * remaining entities back to plain characters.
 */
char *markup_strip(char *str)
{
        ASSERT_OR_RET(str, NULL);

        // The return value is not used here, so this step relies on
        // str being modified in place
        string_strip_delimited(str, '<', '>');

        // Entities are resolved only after the tags are gone
        return markup_unquote(str);
}
234
/**
 * Determine if an & character pointed to by \p str is a markup & entity or
 * part of the text
 *
 * Accepted are the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and
 * `&apos;` as well as numeric references in decimal (`&#1234;`) or
 * hexadecimal (`&#xABC;`) notation. Everything else, including an ampersand
 * that is not followed by any `;`, is treated as plain text.
 *
 * @param str Pointer to the `&` to examine (must not be `NULL`)
 *
 * @retval true: \p str is an entity
 * @retval false: It's no valid entity
 */
static bool markup_is_entity(const char *str)
{
        assert(str);
        assert(*str == '&');

        // Without any terminating ';' this can only be a literal ampersand
        const char *end = strchr(str, ';');
        if (!end)
                return false;

        // Parse (hexa)decimal entities with the format &#1234; or &#xABC;
        if (str[1] == '#') {
                const char *cur = str + 2;

                if (*cur == 'x') {
                        cur++;

                        // Reject &#x;
                        if (*cur == ';')
                                return false;

                        // <ctype.h> functions need a value representable
                        // as unsigned char, a plain char may be negative
                        while (cur < end && isxdigit((unsigned char)*cur))
                                cur++;
                } else {

                        // Reject &#;
                        if (*cur == ';')
                                return false;

                        while (cur < end && isdigit((unsigned char)*cur))
                                cur++;
                }

                // Valid only if nothing but digits preceded the ';'
                return (cur == end);
        }

        static const char *const supported_tags[] = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
        for (size_t i = 0; i < sizeof(supported_tags)/sizeof(*supported_tags); i++) {
                if (strncmp(str, supported_tags[i], strlen(supported_tags[i])) == 0)
                        return true;
        }
        return false;
}
283
/**
 * Escape all unsupported and invalid &-entities in a string. If the resulting
 * string does not fit it will be reallocated.
 *
 * Every `&` that markup_is_entity() does not accept is replaced by `&amp;`;
 * accepted entities are left untouched.
 *
 * @param str (nullable) The string to be transformed
 *
 * @returns The transformed string; callers have to continue with this
 *          pointer instead of \p str
 */
static char *markup_escape_unsupported(char *str)
{
        ASSERT_OR_RET(str, NULL);

        char *match = str;
        while ((match = strchr(match, '&'))) {
                if (!markup_is_entity(match)) {
                        // Remember the offset and recompute match from the
                        // pointer string_replace_at() hands back
                        int pos = match - str;
                        str = string_replace_at(str, pos, 1, "&amp;");
                        // Continue behind the inserted entity, otherwise
                        // its own '&' would be found again
                        match = str + pos + strlen("&amp;");
                } else {
                        match++;
                }
        }

        return str;
}
307
308 /* see markup.h */
markup_transform(char * str,enum markup_mode markup_mode)309 char *markup_transform(char *str, enum markup_mode markup_mode)
310 {
311 ASSERT_OR_RET(str, NULL);
312
313 switch (markup_mode) {
314 case MARKUP_NULL:
315 /* `assert(false)`, but with a meaningful error message */
316 assert(markup_mode != MARKUP_NULL);
317 break;
318 case MARKUP_NO:
319 str = markup_quote(str);
320 break;
321 case MARKUP_STRIP:
322 str = markup_br2nl(str);
323 str = markup_strip(str);
324 str = markup_quote(str);
325 break;
326 case MARKUP_FULL:
327 str = markup_escape_unsupported(str);
328 str = markup_br2nl(str);
329 markup_strip_a(&str, NULL);
330 markup_strip_img(&str, NULL);
331 break;
332 }
333
334 if (settings.ignore_newline) {
335 str = string_replace_all("\n", " ", str);
336 }
337
338 return str;
339 }
340
341 /* vim: set ft=c tabstop=8 shiftwidth=8 expandtab textwidth=0: */
342