1 /*
2  * uriparser - RFC 3986 URI parsing library
3  *
4  * Copyright (C) 2007, Weijia Song <songweijia@gmail.com>
5  * Copyright (C) 2007, Sebastian Pipping <sebastian@pipping.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source  and binary forms, with or without
9  * modification, are permitted provided  that the following conditions
10  * are met:
11  *
12  *     1. Redistributions  of  source  code   must  retain  the  above
13  *        copyright notice, this list  of conditions and the following
14  *        disclaimer.
15  *
16  *     2. Redistributions  in binary  form  must  reproduce the  above
17  *        copyright notice, this list  of conditions and the following
18  *        disclaimer  in  the  documentation  and/or  other  materials
19  *        provided with the distribution.
20  *
21  *     3. Neither the  name of the  copyright holder nor the  names of
22  *        its contributors may be used  to endorse or promote products
23  *        derived from  this software  without specific  prior written
24  *        permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27  * "AS IS" AND  ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING, BUT NOT
28  * LIMITED TO,  THE IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS
29  * FOR  A  PARTICULAR  PURPOSE  ARE  DISCLAIMED.  IN  NO  EVENT  SHALL
30  * THE  COPYRIGHT HOLDER  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT,
31  * INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR CONSEQUENTIAL DAMAGES
32  * (INCLUDING, BUT NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE GOODS OR
33  * SERVICES; LOSS OF USE, DATA,  OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
35  * STRICT  LIABILITY,  OR  TORT (INCLUDING  NEGLIGENCE  OR  OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
37  * OF THE POSSIBILITY OF SUCH DAMAGE.
38  */
39 
40 /* What encodings are enabled? */
41 #include <uriparser/UriDefsConfig.h>
42 #if (!defined(URI_PASS_ANSI) && !defined(URI_PASS_UNICODE))
43 /* Include SELF twice */
44 # ifdef URI_ENABLE_ANSI
45 #  define URI_PASS_ANSI 1
46 #  include "UriEscape.c"
47 #  undef URI_PASS_ANSI
48 # endif
49 # ifdef URI_ENABLE_UNICODE
50 #  define URI_PASS_UNICODE 1
51 #  include "UriEscape.c"
52 #  undef URI_PASS_UNICODE
53 # endif
54 #else
55 # ifdef URI_PASS_ANSI
56 #  include <uriparser/UriDefsAnsi.h>
57 # else
58 #  include <uriparser/UriDefsUnicode.h>
59 #  include <wchar.h>
60 # endif
61 
62 
63 
64 #ifndef URI_DOXYGEN
65 # include <uriparser/Uri.h>
66 # include "UriCommon.h"
67 #endif
68 
69 
70 
URI_FUNC(Escape)71 URI_CHAR * URI_FUNC(Escape)(const URI_CHAR * in, URI_CHAR * out,
72 		UriBool spaceToPlus, UriBool normalizeBreaks) {
73 	return URI_FUNC(EscapeEx)(in, NULL, out, spaceToPlus, normalizeBreaks);
74 }
75 
76 
77 
URI_FUNC(EscapeEx)78 URI_CHAR * URI_FUNC(EscapeEx)(const URI_CHAR * inFirst,
79 		const URI_CHAR * inAfterLast, URI_CHAR * out,
80 		UriBool spaceToPlus, UriBool normalizeBreaks) {
81 	const URI_CHAR * read = inFirst;
82 	URI_CHAR * write = out;
83 	UriBool prevWasCr = URI_FALSE;
84 	if ((out == NULL) || (inFirst == out)) {
85 		return NULL;
86 	} else if (inFirst == NULL) {
87 		if (out != NULL) {
88 			out[0] = _UT('\0');
89 		}
90 		return out;
91 	}
92 
93 	for (;;) {
94 		if ((inAfterLast != NULL) && (read >= inAfterLast)) {
95 			write[0] = _UT('\0');
96 			return write;
97 		}
98 
99 		switch (read[0]) {
100 		case _UT('\0'):
101 			write[0] = _UT('\0');
102 			return write;
103 
104 		case _UT(' '):
105 			if (spaceToPlus) {
106 				write[0] = _UT('+');
107 				write++;
108 			} else {
109 				write[0] = _UT('%');
110 				write[1] = _UT('2');
111 				write[2] = _UT('0');
112 				write += 3;
113 			}
114 			prevWasCr = URI_FALSE;
115 			break;
116 
117 		case _UT('a'): /* ALPHA */
118 		case _UT('A'):
119 		case _UT('b'):
120 		case _UT('B'):
121 		case _UT('c'):
122 		case _UT('C'):
123 		case _UT('d'):
124 		case _UT('D'):
125 		case _UT('e'):
126 		case _UT('E'):
127 		case _UT('f'):
128 		case _UT('F'):
129 		case _UT('g'):
130 		case _UT('G'):
131 		case _UT('h'):
132 		case _UT('H'):
133 		case _UT('i'):
134 		case _UT('I'):
135 		case _UT('j'):
136 		case _UT('J'):
137 		case _UT('k'):
138 		case _UT('K'):
139 		case _UT('l'):
140 		case _UT('L'):
141 		case _UT('m'):
142 		case _UT('M'):
143 		case _UT('n'):
144 		case _UT('N'):
145 		case _UT('o'):
146 		case _UT('O'):
147 		case _UT('p'):
148 		case _UT('P'):
149 		case _UT('q'):
150 		case _UT('Q'):
151 		case _UT('r'):
152 		case _UT('R'):
153 		case _UT('s'):
154 		case _UT('S'):
155 		case _UT('t'):
156 		case _UT('T'):
157 		case _UT('u'):
158 		case _UT('U'):
159 		case _UT('v'):
160 		case _UT('V'):
161 		case _UT('w'):
162 		case _UT('W'):
163 		case _UT('x'):
164 		case _UT('X'):
165 		case _UT('y'):
166 		case _UT('Y'):
167 		case _UT('z'):
168 		case _UT('Z'):
169 		case _UT('0'): /* DIGIT */
170 		case _UT('1'):
171 		case _UT('2'):
172 		case _UT('3'):
173 		case _UT('4'):
174 		case _UT('5'):
175 		case _UT('6'):
176 		case _UT('7'):
177 		case _UT('8'):
178 		case _UT('9'):
179 		case _UT('-'): /* "-" / "." / "_" / "~" */
180 		case _UT('.'):
181 		case _UT('_'):
182 		case _UT('~'):
183 			/* Copy unmodified */
184 			write[0] = read[0];
185 			write++;
186 
187 			prevWasCr = URI_FALSE;
188 			break;
189 
190 		case _UT('\x0a'):
191 			if (normalizeBreaks) {
192 				if (!prevWasCr) {
193 					write[0] = _UT('%');
194 					write[1] = _UT('0');
195 					write[2] = _UT('D');
196 					write[3] = _UT('%');
197 					write[4] = _UT('0');
198 					write[5] = _UT('A');
199 					write += 6;
200 				}
201 			} else {
202 				write[0] = _UT('%');
203 				write[1] = _UT('0');
204 				write[2] = _UT('A');
205 				write += 3;
206 			}
207 			prevWasCr = URI_FALSE;
208 			break;
209 
210 		case _UT('\x0d'):
211 			if (normalizeBreaks) {
212 				write[0] = _UT('%');
213 				write[1] = _UT('0');
214 				write[2] = _UT('D');
215 				write[3] = _UT('%');
216 				write[4] = _UT('0');
217 				write[5] = _UT('A');
218 				write += 6;
219 			} else {
220 				write[0] = _UT('%');
221 				write[1] = _UT('0');
222 				write[2] = _UT('D');
223 				write += 3;
224 			}
225 			prevWasCr = URI_TRUE;
226 			break;
227 
228 		default:
229 			/* Percent encode */
230 			{
231 				const unsigned char code = (unsigned char)read[0];
232 				write[0] = _UT('%');
233 				write[1] = URI_FUNC(HexToLetter)(code >> 4);
234 				write[2] = URI_FUNC(HexToLetter)(code & 0x0f);
235 				write += 3;
236 			}
237 			prevWasCr = URI_FALSE;
238 			break;
239 		}
240 
241 		read++;
242 	}
243 }
244 
245 
246 
URI_FUNC(UnescapeInPlace)247 const URI_CHAR * URI_FUNC(UnescapeInPlace)(URI_CHAR * inout) {
248 	return URI_FUNC(UnescapeInPlaceEx)(inout, URI_FALSE, URI_BR_DONT_TOUCH);
249 }
250 
251 
252 
URI_FUNC(UnescapeInPlaceEx)253 const URI_CHAR * URI_FUNC(UnescapeInPlaceEx)(URI_CHAR * inout,
254 		UriBool plusToSpace, UriBreakConversion breakConversion) {
255 	URI_CHAR * read = inout;
256 	URI_CHAR * write = inout;
257 	UriBool prevWasCr = URI_FALSE;
258 
259 	if (inout == NULL) {
260 		return NULL;
261 	}
262 
263 	for (;;) {
264 		switch (read[0]) {
265 		case _UT('\0'):
266 			if (read > write) {
267 				write[0] = _UT('\0');
268 			}
269 			return write;
270 
271 		case _UT('%'):
272 			switch (read[1]) {
273 			case _UT('0'):
274 			case _UT('1'):
275 			case _UT('2'):
276 			case _UT('3'):
277 			case _UT('4'):
278 			case _UT('5'):
279 			case _UT('6'):
280 			case _UT('7'):
281 			case _UT('8'):
282 			case _UT('9'):
283 			case _UT('a'):
284 			case _UT('b'):
285 			case _UT('c'):
286 			case _UT('d'):
287 			case _UT('e'):
288 			case _UT('f'):
289 			case _UT('A'):
290 			case _UT('B'):
291 			case _UT('C'):
292 			case _UT('D'):
293 			case _UT('E'):
294 			case _UT('F'):
295 				switch (read[2]) {
296 				case _UT('0'):
297 				case _UT('1'):
298 				case _UT('2'):
299 				case _UT('3'):
300 				case _UT('4'):
301 				case _UT('5'):
302 				case _UT('6'):
303 				case _UT('7'):
304 				case _UT('8'):
305 				case _UT('9'):
306 				case _UT('a'):
307 				case _UT('b'):
308 				case _UT('c'):
309 				case _UT('d'):
310 				case _UT('e'):
311 				case _UT('f'):
312 				case _UT('A'):
313 				case _UT('B'):
314 				case _UT('C'):
315 				case _UT('D'):
316 				case _UT('E'):
317 				case _UT('F'):
318 					{
319 						/* Percent group found */
320 						const unsigned char left = URI_FUNC(HexdigToInt)(read[1]);
321 						const unsigned char right = URI_FUNC(HexdigToInt)(read[2]);
322 						const int code = 16 * left + right;
323 						switch (code) {
324 						case 10:
325 							switch (breakConversion) {
326 							case URI_BR_TO_LF:
327 								if (!prevWasCr) {
328 									write[0] = (URI_CHAR)10;
329 									write++;
330 								}
331 								break;
332 
333 							case URI_BR_TO_CRLF:
334 								if (!prevWasCr) {
335 									write[0] = (URI_CHAR)13;
336 									write[1] = (URI_CHAR)10;
337 									write += 2;
338 								}
339 								break;
340 
341 							case URI_BR_TO_CR:
342 								if (!prevWasCr) {
343 									write[0] = (URI_CHAR)13;
344 									write++;
345 								}
346 								break;
347 
348 							case URI_BR_DONT_TOUCH:
349 							default:
350 								write[0] = (URI_CHAR)10;
351 								write++;
352 
353 							}
354 							prevWasCr = URI_FALSE;
355 							break;
356 
357 						case 13:
358 							switch (breakConversion) {
359 							case URI_BR_TO_LF:
360 								write[0] = (URI_CHAR)10;
361 								write++;
362 								break;
363 
364 							case URI_BR_TO_CRLF:
365 								write[0] = (URI_CHAR)13;
366 								write[1] = (URI_CHAR)10;
367 								write += 2;
368 								break;
369 
370 							case URI_BR_TO_CR:
371 								write[0] = (URI_CHAR)13;
372 								write++;
373 								break;
374 
375 							case URI_BR_DONT_TOUCH:
376 							default:
377 								write[0] = (URI_CHAR)13;
378 								write++;
379 
380 							}
381 							prevWasCr = URI_TRUE;
382 							break;
383 
384 						default:
385 							write[0] = (URI_CHAR)(code);
386 							write++;
387 
388 							prevWasCr = URI_FALSE;
389 
390 						}
391 						read += 3;
392 					}
393 					break;
394 
395 				default:
396 					/* Copy two chars unmodified and */
397 					/* look at this char again */
398 					if (read > write) {
399 						write[0] = read[0];
400 						write[1] = read[1];
401 					}
402 					read += 2;
403 					write += 2;
404 
405 					prevWasCr = URI_FALSE;
406 				}
407 				break;
408 
409 			default:
410 				/* Copy one char unmodified and */
411 				/* look at this char again */
412 				if (read > write) {
413 					write[0] = read[0];
414 				}
415 				read++;
416 				write++;
417 
418 				prevWasCr = URI_FALSE;
419 			}
420 			break;
421 
422 		case _UT('+'):
423 			if (plusToSpace) {
424 				/* Convert '+' to ' ' */
425 				write[0] = _UT(' ');
426 			} else {
427 				/* Copy one char unmodified */
428 				if (read > write) {
429 					write[0] = read[0];
430 				}
431 			}
432 			read++;
433 			write++;
434 
435 			prevWasCr = URI_FALSE;
436 			break;
437 
438 		default:
439 			/* Copy one char unmodified */
440 			if (read > write) {
441 				write[0] = read[0];
442 			}
443 			read++;
444 			write++;
445 
446 			prevWasCr = URI_FALSE;
447 		}
448 	}
449 }
450 
451 
452 
453 #endif
454