1 /*
2  * Copyright (C) 2013 Nikos Mavrogiannopoulos
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <config.h>
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <stddef.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <syslog.h>
26 #include <common.h>
27 #include <c-strcase.h>
28 #include <c-ctype.h>
29 #include <wchar.h>
30 
31 #include "html.h"
32 
/*
 * Returns the numeric value of digit c in the given base (10 or 16),
 * or -1 when c is not a valid digit for that base.
 */
static int html_digit_value(char c, unsigned base)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (base == 16) {
		if (c >= 'a' && c <= 'f')
			return c - 'a' + 10;
		if (c >= 'A' && c <= 'F')
			return c - 'A' + 10;
	}
	return -1;
}

/*
 * Parses a numeric character reference ("&#DDD;", "&#xHHH;" or "&#XHHH;")
 * located at s, never looking at more than avail bytes.
 *
 * On success the code point is stored in *val and the number of input
 * bytes occupied by the reference (including the ';') is returned.
 * Returns 0 when the text is not a well-formed reference: no digits,
 * no terminating ';' within avail bytes, a value of zero, or a value
 * larger than WCHAR_MAX.
 */
static unsigned html_parse_numeric_ref(const char *s, unsigned avail,
				       unsigned long *val)
{
	const unsigned long max = (unsigned long)WCHAR_MAX;
	unsigned long acc = 0;
	unsigned base = 10;
	unsigned ndigits = 0;
	unsigned p = 2;		/* index of the first byte after "&#" */

	/* the shortest possible reference is "&#1;" */
	if (avail < 4 || s[0] != '&' || s[1] != '#')
		return 0;

	if (s[p] == 'x' || s[p] == 'X') {
		base = 16;
		p++;
	}

	for (; p < avail; p++) {
		int d = html_digit_value(s[p], base);

		if (d < 0)
			break;

		/* would acc * base + d exceed WCHAR_MAX? (checked without overflow) */
		if (acc > (max - (unsigned long)d) / base)
			return 0;

		acc = acc * base + (unsigned long)d;
		ndigits++;
	}

	if (ndigits == 0 || p >= avail || s[p] != ';')
		return 0;

	/* a zero code point would embed a NUL byte in the returned string */
	if (acc == 0)
		return 0;

	*val = acc;
	return p + 1;
}

/*
 * unescape_html - decode HTML character references found in a buffer.
 *
 * @pool:    talloc context the result is allocated from
 * @html:    input text; only the first @len bytes are examined
 * @len:     number of bytes of @html to process
 * @out_len: if not NULL, receives the length of the result (without
 *           the terminating NUL)
 *
 * The named references &lt; &gt; &nbsp; &quot; &amp; and &apos; are
 * matched case-insensitively and replaced by '<', '>', ' ', '"', '&'
 * and '\''. Numeric references (decimal or hexadecimal) are converted
 * to their multibyte representation with wcrtomb(). Anything that is
 * not a complete, well-formed reference inside the first @len bytes is
 * copied to the output unchanged.
 *
 * Returns a NUL-terminated string allocated with talloc_size(), or NULL
 * if the allocation fails or wcrtomb() cannot convert a numeric
 * reference. The caller owns the returned memory.
 */
char *unescape_html(void *pool, const char *html, unsigned len, unsigned *out_len)
{
	static const struct {
		const char *name;
		unsigned name_len;
		char ch;
	} entities[] = {
		{ "&lt;",   4, '<'  },
		{ "&gt;",   4, '>'  },
		{ "&nbsp;", 6, ' '  },
		{ "&quot;", 6, '"'  },
		{ "&amp;",  5, '&'  },
		{ "&apos;", 6, '\'' },
	};
	const unsigned n_entities = sizeof(entities) / sizeof(entities[0]);
	char *msg;
	unsigned pos = 0;
	unsigned i = 0;

	/* size_t arithmetic so that len + 1 cannot wrap around in unsigned */
	msg = talloc_size(pool, (size_t)len + 1);
	if (msg == NULL)
		return NULL;

	while (i < len) {
		unsigned remaining = len - i;
		unsigned consumed;
		unsigned long cp = 0;
		unsigned k;

		if (html[i] != '&') {
			msg[pos++] = html[i++];
			continue;
		}

		/* named references; only considered when they fit in the input */
		for (k = 0; k < n_entities; k++) {
			if (remaining >= entities[k].name_len &&
			    c_strncasecmp(&html[i], entities[k].name,
					  entities[k].name_len) == 0)
				break;
		}
		if (k < n_entities) {
			msg[pos++] = entities[k].ch;
			i += entities[k].name_len;
			continue;
		}

		consumed = html_parse_numeric_ref(&html[i], remaining, &cp);
		if (consumed == 0) {
			/* not a reference we understand: keep the '&' literally */
			msg[pos++] = html[i++];
		} else {
			char tmpmb[MB_CUR_MAX];
			mbstate_t ps;
			size_t n;

			memset(&ps, 0, sizeof(ps));
			n = wcrtomb(tmpmb, (wchar_t)cp, &ps);
			if (n == (size_t)-1)
				goto fail;

			/* the multibyte sequence must fit in what is left of msg */
			if (n >= (size_t)(len - pos))
				goto fail;

			memcpy(&msg[pos], tmpmb, n);
			pos += (unsigned)n;
			i += consumed;
		}
	}

	msg[pos] = 0;
	if (out_len)
		*out_len = pos;

	return msg;
 fail:
	talloc_free(msg);
	return NULL;
}
114 
/* Returns the value of the hexadecimal digit c, or -1 if c is not one. */
static int url_hex_value(char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

/*
 * unescape_url - decode a percent-encoded (URL-escaped) buffer.
 *
 * @pool:    talloc context the result is allocated from
 * @url:     input text; only the first @len bytes are examined
 * @len:     number of bytes of @url to process
 * @out_len: if not NULL, receives the length of the result (without
 *           the terminating NUL)
 *
 * "%HH" sequences are replaced by the byte with hexadecimal value HH,
 * '+' is replaced by a space and every other byte is copied verbatim.
 *
 * Returns a NUL-terminated string allocated with talloc_size(), or NULL
 * if the allocation fails or a '%' is not followed by two hexadecimal
 * digits within the first @len bytes (the latter is reported through
 * syslog). The caller owns the returned memory.
 */
char *unescape_url(void *pool, const char *url, unsigned len, unsigned *out_len)
{
	char *msg;
	unsigned pos = 0;
	unsigned i = 0;

	/* size_t arithmetic so that len + 1 cannot wrap around in unsigned */
	msg = talloc_size(pool, (size_t)len + 1);
	if (msg == NULL)
		return NULL;

	while (i < len) {
		if (url[i] == '%') {
			int hi = -1;
			int lo = -1;

			/* both hex digits must lie inside the input buffer */
			if (len - i >= 3) {
				hi = url_hex_value(url[i + 1]);
				lo = url_hex_value(url[i + 2]);
			}

			if (hi < 0 || lo < 0) {
				talloc_free(msg);
				/* bound the print by len: url is length-delimited */
				syslog(LOG_ERR, "%s: error parsing URL: %.*s",
				       __func__, (int)len, url);
				return NULL;
			}

			msg[pos++] = (char)((hi << 4) | lo);
			i += 3;
		} else if (url[i] == '+') {
			msg[pos++] = ' ';
			i++;
		} else {
			msg[pos++] = url[i++];
		}
	}

	msg[pos] = 0;
	if (out_len)
		*out_len = pos;

	return msg;
}
155 
/*
 * escape_url - percent-encode a buffer for use in a URL.
 *
 * @pool:    talloc context the result is allocated from
 * @url:     input bytes; only the first @len bytes are examined
 * @len:     number of bytes of @url to process
 * @out_len: if not NULL, receives the length of the result (without
 *           the terminating NUL)
 *
 * ASCII letters and digits and the characters '-', '_', '.' and '~' are
 * copied unchanged, a space becomes '+', and every other byte is written
 * as "%HH" where HH is the byte value in uppercase hexadecimal.
 *
 * Returns a NUL-terminated string allocated with talloc_size(), or NULL
 * if the allocation fails or @len is so large that the worst-case output
 * size (3 * @len + 1) does not fit in an unsigned. The caller owns the
 * returned memory.
 */
char *escape_url(void *pool, const char *url, unsigned len, unsigned *out_len)
{
	static const char hex_digits[] = "0123456789ABCDEF";
	char *msg;
	unsigned pos = 0;
	unsigned i;

	/* 3 * len + 1 must not wrap around */
	if (len > ((unsigned)-1 - 1) / 3)
		return NULL;

	msg = talloc_size(pool, 3 * len + 1);
	if (msg == NULL)
		return NULL;

	for (i = 0; i < len; i++) {
		/*
		 * Work on the byte as unsigned char: with a signed char type,
		 * bytes >= 0x80 would otherwise be sign-extended and encoded
		 * incorrectly.
		 */
		unsigned char c = (unsigned char)url[i];

		if (c_isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') {
			msg[pos++] = (char)c;
		} else if (c == ' ') {
			msg[pos++] = '+';
		} else {
			msg[pos++] = '%';
			msg[pos++] = hex_digits[c >> 4];
			msg[pos++] = hex_digits[c & 0x0F];
		}
	}

	msg[pos] = 0;
	if (out_len)
		*out_len = pos;

	return msg;
}
183 
184