1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "udm_uniconv.h"
24 #include "udm_sgml.h"
25 #include "udm_unicode.h"
26
27
/*
  Table of named character references.

  sgml     Entity name without the leading '&' and the trailing ';'.
  unicode  Code point the entity stands for.

  The table is terminated by an entry whose "unicode" member is 0.
*/
static const struct udm_sgml_chars
{
  const char *sgml;
  int unicode;
} SGMLChars[] = {

  { "lt",       '<' },
  { "gt",       '>' },
  { "amp",      '&' },
  { "quot",     '"' },

  { "nbsp",     0xA0 }, /* non breaking space */

  /* ISO-8859-1 entities */

  { "iexcl",    161 }, /* inverted exclamation mark */
  { "cent",     162 }, /* cent sign */
  { "pound",    163 }, /* pound sign */
  { "curren",   164 }, /* currency sign */
  { "yen",      165 }, /* yen sign */
  { "brvbar",   166 }, /* broken vertical bar, (brkbar) */
  { "sect",     167 }, /* section sign */
  { "uml",      168 }, /* spacing diaeresis */
  { "copy",     169 }, /* copyright sign */
  { "ordf",     170 }, /* feminine ordinal indicator */
  { "laquo",    171 }, /* angle quotation mark, left */
  { "not",      172 }, /* negation sign */
  { "shy",      173 }, /* soft hyphen */
  { "reg",      174 }, /* circled R registered sign */
  { "hibar",    175 }, /* spacing macron (legacy name, kept for compatibility) */
  { "macr",     175 }, /* spacing macron (HTML 4 name) */
  { "deg",      176 }, /* degree sign */
  { "plusmn",   177 }, /* plus-or-minus sign */
  { "sup2",     178 }, /* superscript 2 */
  { "sup3",     179 }, /* superscript 3 */
  { "acute",    180 }, /* spacing acute (96) */
  { "micro",    181 }, /* micro sign */
  { "para",     182 }, /* paragraph sign */
  { "middot",   183 }, /* middle dot */
  { "cedil",    184 }, /* spacing cedilla */
  { "sup1",     185 }, /* superscript 1 */
  { "ordm",     186 }, /* masculine ordinal indicator */
  { "raquo",    187 }, /* angle quotation mark, right */
  { "frac14",   188 }, /* fraction 1/4 */
  { "frac12",   189 }, /* fraction 1/2 */
  { "frac34",   190 }, /* fraction 3/4 */
  { "iquest",   191 }, /* inverted question mark */
  { "Agrave",   192 }, /* capital A, grave accent */
  { "Aacute",   193 }, /* capital A, acute accent */
  { "Acirc",    194 }, /* capital A, circumflex accent */
  { "Atilde",   195 }, /* capital A, tilde */
  { "Auml",     196 }, /* capital A, dieresis or umlaut mark */
  { "Aring",    197 }, /* capital A, ring */
  { "AElig",    198 }, /* capital AE diphthong (ligature) */
  { "Ccedil",   199 }, /* capital C, cedilla */
  { "Egrave",   200 }, /* capital E, grave accent */
  { "Eacute",   201 }, /* capital E, acute accent */
  { "Ecirc",    202 }, /* capital E, circumflex accent */
  { "Euml",     203 }, /* capital E, dieresis or umlaut mark */
  { "Igrave",   204 }, /* capital I, grave accent */
  { "Iacute",   205 }, /* capital I, acute accent */
  { "Icirc",    206 }, /* capital I, circumflex accent */
  { "Iuml",     207 }, /* capital I, dieresis or umlaut mark */
  { "ETH",      208 }, /* capital Eth, Icelandic (Dstrok) */
  { "Ntilde",   209 }, /* capital N, tilde */
  { "Ograve",   210 }, /* capital O, grave accent */
  { "Oacute",   211 }, /* capital O, acute accent */
  { "Ocirc",    212 }, /* capital O, circumflex accent */
  { "Otilde",   213 }, /* capital O, tilde */
  { "Ouml",     214 }, /* capital O, dieresis or umlaut mark */
  { "times",    215 }, /* multiplication sign */
  { "Oslash",   216 }, /* capital O, slash */
  { "Ugrave",   217 }, /* capital U, grave accent */
  { "Uacute",   218 }, /* capital U, acute accent */
  { "Ucirc",    219 }, /* capital U, circumflex accent */
  { "Uuml",     220 }, /* capital U, dieresis or umlaut mark */
  { "Yacute",   221 }, /* capital Y, acute accent */
  { "THORN",    222 }, /* capital THORN, Icelandic */
  { "szlig",    223 }, /* small sharp s, German (sz ligature) */
  { "agrave",   224 }, /* small a, grave accent */
  { "aacute",   225 }, /* small a, acute accent */
  { "acirc",    226 }, /* small a, circumflex accent */
  { "atilde",   227 }, /* small a, tilde */
  { "auml",     228 }, /* small a, dieresis or umlaut mark */
  { "aring",    229 }, /* small a, ring */
  { "aelig",    230 }, /* small ae diphthong (ligature) */
  { "ccedil",   231 }, /* small c, cedilla */
  { "egrave",   232 }, /* small e, grave accent */
  { "eacute",   233 }, /* small e, acute accent */
  { "ecirc",    234 }, /* small e, circumflex accent */
  { "euml",     235 }, /* small e, dieresis or umlaut mark */
  { "igrave",   236 }, /* small i, grave accent */
  { "iacute",   237 }, /* small i, acute accent */
  { "icirc",    238 }, /* small i, circumflex accent */
  { "iuml",     239 }, /* small i, dieresis or umlaut mark */
  { "eth",      240 }, /* small eth, Icelandic */
  { "ntilde",   241 }, /* small n, tilde */
  { "ograve",   242 }, /* small o, grave accent */
  { "oacute",   243 }, /* small o, acute accent */
  { "ocirc",    244 }, /* small o, circumflex accent */
  { "otilde",   245 }, /* small o, tilde */
  { "ouml",     246 }, /* small o, dieresis or umlaut mark */
  { "divide",   247 }, /* division sign */
  { "oslash",   248 }, /* small o, slash */
  { "ugrave",   249 }, /* small u, grave accent */
  { "uacute",   250 }, /* small u, acute accent */
  { "ucirc",    251 }, /* small u, circumflex accent */
  { "uuml",     252 }, /* small u, dieresis or umlaut mark */
  { "yacute",   253 }, /* small y, acute accent */
  { "thorn",    254 }, /* small thorn, Icelandic */
  { "yuml",     255 }, /* small y, dieresis or umlaut mark */

  /* Latin Extended-A */

  { "OElig",    338 },
  { "oelig",    339 },
  { "Scaron",   352 },
  { "scaron",   353 },
  { "Yuml",     376 },

  /* Latin-Extended-B */

  { "fnof",     402 },


  /* Accents */

  { "circ",     710 },
  { "tilde",    732 },

  /* Greek */

  { "Alpha",    913 },
  { "Beta",     914 },
  { "Gamma",    915 },
  { "Delta",    916 },
  { "Epsilon",  917 },
  { "Zeta",     918 },
  { "Eta",      919 },
  { "Theta",    920 },
  { "Iota",     921 },
  { "Kappa",    922 },
  { "Lambda",   923 },
  { "Mu",       924 },
  { "Nu",       925 },
  { "Xi",       926 },
  { "Omicron",  927 },
  { "Pi",       928 },
  { "Rho",      929 },
  { "Sigma",    931 },
  { "Tau",      932 },
  { "Upsilon",  933 },
  { "Phi",      934 },
  { "Chi",      935 },
  { "Psi",      936 },
  { "Omega",    937 },
  { "alpha",    945 },
  { "beta",     946 },
  { "gamma",    947 },
  { "delta",    948 },
  { "epsilon",  949 },
  { "zeta",     950 },
  { "eta",      951 },
  { "theta",    952 },
  { "iota",     953 },
  { "kappa",    954 },
  { "lambda",   955 },
  { "mu",       956 },
  { "nu",       957 },
  { "xi",       958 },
  { "omicron",  959 },
  { "pi",       960 },
  { "rho",      961 },
  { "sigmaf",   962 },
  { "sigma",    963 },
  { "tau",      964 },
  { "upsilon",  965 },
  { "phi",      966 },
  { "chi",      967 },
  { "psi",      968 },
  { "omega",    969 },
  { "thetasym", 977 },
  { "upsih",    978 },
  { "piv",      982 },


  /* Punctuation */

  { "ensp",     8194 }, /* en space */
  { "emsp",     8195 }, /* em space */
  { "thinsp",   8201 },
  { "zwnj",     8204 }, /* zero width non-joiner */
  { "zwj",      8205 }, /* zero width joiner */
  { "lrm",      8206 }, /* left-to-right mark */
  { "rlm",      8207 }, /* right-to-left mark */
  { "ndash",    8211 }, /* en dash */
  { "mdash",    8212 }, /* em dash */
  { "lsquo",    8216 },
  { "rsquo",    8217 },
  { "sbquo",    8218 },
  { "ldquo",    8220 },
  { "rdquo",    8221 },
  { "bdquo",    8222 },
  { "dagger",   8224 },
  { "Dagger",   8225 },
  { "bull",     8226 },
  { "hellip",   8230 },
  { "permil",   8240 },
  { "prime",    8242 },
  { "Prime",    8243 },
  { "lsaquo",   8249 },
  { "rsaquo",   8250 },
  { "oline",    8254 },
  { "frasl",    8260 },
  { "euro",     8364 },


  /* Letter type characters */

  { "weierp",   8472 },
  { "image",    8465 },
  { "real",     8476 },
  { "trade",    8482 },
  { "alefsym",  8501 },

  /* Arrows */

  { "larr",     8592 },
  { "uarr",     8593 },
  { "rarr",     8594 },
  { "darr",     8595 },
  { "harr",     8596 },
  { "crarr",    8629 },
  { "lArr",     8656 },
  { "uArr",     8657 },
  { "rArr",     8658 },
  { "dArr",     8659 },
  { "hArr",     8660 },

  /* Math characters */

  { "forall",   8704 },
  { "part",     8706 },
  { "exist",    8707 },
  { "empty",    8709 },
  { "nabla",    8711 },
  { "isin",     8712 },
  { "notin",    8713 },
  { "ni",       8715 },
  { "prod",     8719 },
  { "sum",      8721 },
  { "minus",    8722 },
  { "lowast",   8727 },
  { "radic",    8730 },
  { "prop",     8733 },
  { "infin",    8734 },
  { "ang",      8736 },
  { "and",      8743 },
  { "or",       8744 },
  { "cap",      8745 },
  { "cup",      8746 },
  { "int",      8747 },
  { "there4",   8756 },
  { "sim",      8764 },
  { "cong",     8773 },
  { "asymp",    8776 },
  { "ne",       8800 },
  { "equiv",    8801 },
  { "le",       8804 },
  { "ge",       8805 },
  { "sub",      8834 },
  { "sup",      8835 },
  { "nsub",     8836 },
  { "sube",     8838 },
  { "supe",     8839 },
  { "oplus",    8853 },
  { "otimes",   8855 },
  { "perp",     8869 },
  { "sdot",     8901 },

  /* Misc tech characters */

  { "lceil",    8968 },
  { "rceil",    8969 },
  { "lfloor",   8970 },
  { "rfloor",   8971 },
  { "lang",     9001 },
  { "rang",     9002 },
  { "loz",      9674 },
  { "spades",   9824 },
  { "clubs",    9827 },
  { "hearts",   9829 },
  { "diams",    9830 },

  /* END Marker */

  { "",         0 }};
324
325
UdmSgmlToUni(const char * sgml)326 udm_wc_t UdmSgmlToUni(const char *sgml)
327 {
328 const struct udm_sgml_chars *p;
329 for(p= SGMLChars; p->unicode; p++)
330 {
331 const char *s, *t;
332 for (s= sgml, t= p->sgml; *s == *t ; s++, t++);
333 if (!*t)
334 return p->unicode;
335 }
336 return 0;
337 }
338
339
340 /*
341 Scan an entity. Return number of bytes scanned.
342 */
UdmSGMLScan(udm_wc_t * wc,const unsigned char * str,const unsigned char * end)343 int UdmSGMLScan(udm_wc_t *wc, const unsigned char *str, const unsigned char *end)
344 {
345 const unsigned char *p, *end10= str + 10;
346 if (end10 > end)
347 end10= end;
348 for (p= str + 2; p < end10; p++)
349 {
350 if (*p == ';')
351 {
352 if (str[1] == '#')
353 {
354 if (str[2] == 'x' || str[2] == 'X')
355 *wc= strtoul((const char*)str + 3, NULL, 16);
356 else
357 *wc= strtoul((const char*)str + 2, NULL, 10);
358 if (*wc > 0x10FFFF)
359 *wc= '?';
360 }
361 else
362 {
363 *wc= UdmSgmlToUni((const char*) str + 1);
364 }
365 if (*wc)
366 return p - str + 1;
367 }
368 }
369 *wc= '&';
370 return 1;
371 }
372
373
374 /** This function replaces SGML entities
375 With their character equivalents
376 */
377
378 #define UDM_MAX_SGML_LEN 20
379
380 UDM_API(char *)
UdmSGMLUnescape(char * str)381 UdmSGMLUnescape(char * str)
382 {
383 char *s= str;
384
385 while (*s)
386 {
387 if (*s == '&')
388 {
389 if (s[1] == '#')
390 {
391 char *e;
392 for(e= s + 2; (e - s < UDM_MAX_SGML_LEN) &&
393 (*e <= '9') && (*e >= '0'); e++);
394 if(*e == ';')
395 {
396 int v= atoi(s + 2);
397 if (v >= 0 && v <= 255)
398 {
399 *s= (char) v;
400 }
401 else
402 {
403 *s = ' ';
404 }
405 memmove(s + 1, e + 1, strlen(e + 1) + 1);
406 }
407 }
408 else
409 {
410 char *e, c;
411 for (e= s + 1; (e - s < UDM_MAX_SGML_LEN) &&
412 (((*e <= 'z') && (*e >= 'a'))||
413 ((*e <= 'Z') && (*e >= 'A')));e++);
414
415 if ((*e == ';') && (c= (char) UdmSgmlToUni(s + 1)))
416 {
417 *s= c;
418 memmove(s + 1, e + 1, strlen(e + 1) + 1);
419 }
420 }
421 }
422 s++;
423 }
424 return str;
425 }
426
427
428 /** This function replaces SGML entities
429 With their UNICODE equivalents
430 */
UdmSGMLUniUnescape(int * ustr)431 void UdmSGMLUniUnescape(int * ustr)
432 {
433 int *s= ustr;
434
435 for ( ; *s; s++)
436 {
437 if (*s == '&')
438 {
439 char sgml[UDM_MAX_SGML_LEN + 1];
440 int i= 0;
441 if (s[1] == '#')
442 {
443 int *e;
444 for (e= s + 2; (e - s < UDM_MAX_SGML_LEN) &&
445 (*e <= '9') && (*e >= '0'); e++);
446 if(*e == ';')
447 {
448 for(i= 2; s + i < e; i++)
449 sgml[i - 2]= s[i];
450 sgml[i - 2] = '\0';
451 *s= atoi(sgml);
452 memmove(s + 1, e + 1, sizeof(int) * (UdmUniLen(e + 1) + 1));
453 }
454 }
455 else
456 {
457 int c, *e;
458 for(e= s + 1; (e - s < UDM_MAX_SGML_LEN)&&
459 (((*e <= 'z') && (*e >= 'a'))||
460 ((*e <= 'Z') && (*e >= 'A'))); e++)
461 {
462 sgml[i]= (char) *e;
463 i++;
464 }
465 sgml[i]= 0;
466 if((*e == ';') && (c= UdmSgmlToUni(sgml)))
467 {
468 *s= c;
469 memmove(s + 1, e + 1, sizeof(int) * (UdmUniLen(e + 1) + 1));
470 }
471 }
472 }
473 }
474 }
475
476
/**
  Copy "src" to "dst", escaping the HTML special characters:
    &  ->  &amp;
    "  ->  &quot;
    <  ->  &lt;
    >  ->  &gt;
  All other bytes are copied unchanged.

  dst     Destination buffer.
  dstlen  Size of the destination buffer in bytes.
  src     Source bytes (need not be NUL-terminated).
  srclen  Number of source bytes to encode.

  Returns the number of bytes written to "dst".
  The output is NOT NUL-terminated. Encoding stops at the first source
  character whose replacement does not fit into the remaining space,
  so an entity is never written partially.
*/
size_t UdmHTMLEncode(char *dst, size_t dstlen, const char *src, size_t srclen)
{
  char *dst0= dst;

  for ( ; srclen ; srclen--, src++)
  {
    const char *ch;
    size_t chlen;
    /* Lengths are derived from the literals so the two cannot disagree. */
    switch (*src)
    {
      case '&':
        ch= "&amp;";
        chlen= sizeof("&amp;") - 1;
        break;
      case '"':
        ch= "&quot;";
        chlen= sizeof("&quot;") - 1;
        break;
      case '<':
        ch= "&lt;";
        chlen= sizeof("&lt;") - 1;
        break;
      case '>':
        ch= "&gt;";
        chlen= sizeof("&gt;") - 1;
        break;
      default:
        ch= src;
        chlen= 1;
        break;
    }
    if (chlen > dstlen)
      break;
    memcpy(dst, ch, chlen);
    dst+= chlen;
    dstlen-= chlen;
  }
  return (size_t) (dst - dst0);
}
518