1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <stdlib.h>
21 #include <string.h>
22 
23 #include "udm_uniconv.h"
24 #include "udm_sgml.h"
25 #include "udm_unicode.h"
26 
27 
/*
  Named SGML/HTML character entities and their Unicode code points.
  Names are case-sensitive ("Alpha" and "alpha" are different entities).
  The table is terminated by an entry whose unicode field is 0.
*/
static const struct udm_sgml_chars
{
  const char *sgml;     /* entity name, without the leading '&' and trailing ';' */
  int         unicode;  /* Unicode code point the entity stands for */
} SGMLChars[] = {

{ "lt",       '<'  },
{ "gt",       '>'  },
{ "amp",      '&'  },
{ "quot",     '"'  },

{ "nbsp",     0xA0 },   /* non breaking space */

/* ISO-8859-1 entities */

{ "iexcl",    161  },   /* inverted exclamation mark */
{ "cent",     162  },   /* cent sign */
{ "pound",    163  },   /* pound sign */
{ "curren",   164  },   /* currency sign */
{ "yen",      165  },   /* yen sign */
{ "brvbar",   166  },   /* broken vertical bar, (brkbar) */
{ "sect",     167  },   /* section sign */
{ "uml",      168  },   /* spacing diaresis */
{ "copy",     169  },   /* copyright sign */
{ "ordf",     170  },   /* feminine ordinal indicator */
{ "laquo",    171  },   /* angle quotation mark, left */
{ "not",      172  },   /* negation sign */
{ "shy",      173  },   /* soft hyphen */
{ "reg",      174  },   /* circled R registered sign */
{ "hibar",    175  },   /* spacing macron (legacy name) */
{ "macr",     175  },   /* spacing macron (HTML 4 name) */
{ "deg",      176  },   /* degree sign */
{ "plusmn",   177  },   /* plus-or-minus sign */
{ "sup2",     178  },   /* superscript 2 */
{ "sup3",     179  },   /* superscript 3 */
{ "acute",    180  },   /* spacing acute (96) */
{ "micro",    181  },   /* micro sign */
{ "para",     182  },   /* paragraph sign */
{ "middot",   183  },   /* middle dot */
{ "cedil",    184  },   /* spacing cedilla */
{ "sup1",     185  },   /* superscript 1 */
{ "ordm",     186  },   /* masculine ordinal indicator */
{ "raquo",    187  },   /* angle quotation mark, right */
{ "frac14",   188  },   /* fraction 1/4 */
{ "frac12",   189  },   /* fraction 1/2 */
{ "frac34",   190  },   /* fraction 3/4 */
{ "iquest",   191  },   /* inverted question mark */
{ "Agrave",   192  },   /* capital A, grave accent */
{ "Aacute",   193  },   /* capital A, acute accent */
{ "Acirc",    194  },   /* capital A, circumflex accent */
{ "Atilde",   195  },   /* capital A, tilde */
{ "Auml",     196  },   /* capital A, dieresis or umlaut mark */
{ "Aring",    197  },   /* capital A, ring */
{ "AElig",    198  },   /* capital AE diphthong (ligature) */
{ "Ccedil",   199  },   /* capital C, cedilla */
{ "Egrave",   200  },   /* capital E, grave accent */
{ "Eacute",   201  },   /* capital E, acute accent */
{ "Ecirc",    202  },   /* capital E, circumflex accent */
{ "Euml",     203  },   /* capital E, dieresis or umlaut mark */
{ "Igrave",   204  },   /* capital I, grave accent */
{ "Iacute",   205  },   /* capital I, acute accent */
{ "Icirc",    206  },   /* capital I, circumflex accent */
{ "Iuml",     207  },   /* capital I, dieresis or umlaut mark */
{ "ETH",      208  },   /* capital Eth, Icelandic (Dstrok) */
{ "Ntilde",   209  },   /* capital N, tilde */
{ "Ograve",   210  },   /* capital O, grave accent */
{ "Oacute",   211  },   /* capital O, acute accent */
{ "Ocirc",    212  },   /* capital O, circumflex accent */
{ "Otilde",   213  },   /* capital O, tilde */
{ "Ouml",     214  },   /* capital O, dieresis or umlaut mark */
{ "times",    215  },   /* multiplication sign */
{ "Oslash",   216  },   /* capital O, slash */
{ "Ugrave",   217  },   /* capital U, grave accent */
{ "Uacute",   218  },   /* capital U, acute accent */
{ "Ucirc",    219  },   /* capital U, circumflex accent */
{ "Uuml",     220  },   /* capital U, dieresis or umlaut mark */
{ "Yacute",   221  },   /* capital Y, acute accent */
{ "THORN",    222  },   /* capital THORN, Icelandic */
{ "szlig",    223  },   /* small sharp s, German (sz ligature) */
{ "agrave",   224  },   /* small a, grave accent */
{ "aacute",   225  },   /* small a, acute accent */
{ "acirc",    226  },   /* small a, circumflex accent */
{ "atilde",   227  },   /* small a, tilde */
{ "auml",     228  },   /* small a, dieresis or umlaut mark */
{ "aring",    229  },   /* small a, ring */
{ "aelig",    230  },   /* small ae diphthong (ligature) */
{ "ccedil",   231  },   /* small c, cedilla */
{ "egrave",   232  },   /* small e, grave accent */
{ "eacute",   233  },   /* small e, acute accent */
{ "ecirc",    234  },   /* small e, circumflex accent */
{ "euml",     235  },   /* small e, dieresis or umlaut mark */
{ "igrave",   236  },   /* small i, grave accent */
{ "iacute",   237  },   /* small i, acute accent */
{ "icirc",    238  },   /* small i, circumflex accent */
{ "iuml",     239  },   /* small i, dieresis or umlaut mark */
{ "eth",      240  },   /* small eth, Icelandic */
{ "ntilde",   241  },   /* small n, tilde */
{ "ograve",   242  },   /* small o, grave accent */
{ "oacute",   243  },   /* small o, acute accent */
{ "ocirc",    244  },   /* small o, circumflex accent */
{ "otilde",   245  },   /* small o, tilde */
{ "ouml",     246  },   /* small o, dieresis or umlaut mark */
{ "divide",   247  },   /* division sign */
{ "oslash",   248  },   /* small o, slash */
{ "ugrave",   249  },   /* small u, grave accent */
{ "uacute",   250  },   /* small u, acute accent */
{ "ucirc",    251  },   /* small u, circumflex accent */
{ "uuml",     252  },   /* small u, dieresis or umlaut mark */
{ "yacute",   253  },   /* small y, acute accent */
{ "thorn",    254  },   /* small thorn, Icelandic */
{ "yuml",     255  },   /* small y, dieresis or umlaut mark */

/* Latin Extended-A */

{ "OElig",    338  },
{ "oelig",    339  },
{ "Scaron",   352  },
{ "scaron",   353  },
{ "Yuml",     376  },

/* Latin-Extended-B */

{ "fnof",     402  },


/* Accents */

{ "circ",     710  },
{ "tilde",    732  },

/* Greek */

{ "Alpha",    913  },
{ "Beta",     914  },
{ "Gamma",    915  },
{ "Delta",    916  },
{ "Epsilon",  917  },
{ "Zeta",     918  },
{ "Eta",      919  },
{ "Theta",    920  },
{ "Iota",     921  },
{ "Kappa",    922  },
{ "Lambda",   923  },
{ "Mu",       924  },
{ "Nu",       925  },
{ "Xi",       926  },
{ "Omicron",  927  },
{ "Pi",       928  },
{ "Rho",      929  },
{ "Sigma",    931  },
{ "Tau",      932  },
{ "Upsilon",  933  },
{ "Phi",      934  },
{ "Chi",      935  },
{ "Psi",      936  },
{ "Omega",    937  },
{ "alpha",    945  },
{ "beta",     946  },
{ "gamma",    947  },
{ "delta",    948  },
{ "epsilon",  949  },
{ "zeta",     950  },
{ "eta",      951  },
{ "theta",    952  },
{ "iota",     953  },
{ "kappa",    954  },
{ "lambda",   955  },
{ "mu",       956  },
{ "nu",       957  },
{ "xi",       958  },
{ "omicron",  959  },
{ "pi",       960  },
{ "rho",      961  },
{ "sigmaf",   962  },
{ "sigma",    963  },
{ "tau",      964  },
{ "upsilon",  965  },
{ "phi",      966  },
{ "chi",      967  },
{ "psi",      968  },
{ "omega",    969  },
{ "thetasym", 977  },
{ "upsih",    978  },
{ "piv",      982  },


/* Punctuation */

{ "ensp",     8194 },   /* en space */
{ "emsp",     8195 },   /* em space */
{ "thinsp",   8201 },
{ "zwnj",     8204 },   /* zero width non-joiner */
{ "zwj",      8205 },   /* zero width joiner */
{ "lrm",      8206 },   /* left-to-right mark */
{ "rlm",      8207 },   /* right-to-left mark */
{ "ndash",    8211 },   /* en dash */
{ "mdash",    8212 },   /* em dash */
{ "lsquo",    8216 },
{ "rsquo",    8217 },
{ "sbquo",    8218 },
{ "ldquo",    8220 },
{ "rdquo",    8221 },
{ "bdquo",    8222 },
{ "dagger",   8224 },
{ "Dagger",   8225 },
{ "bull",     8226 },
{ "hellip",   8230 },
{ "permil",   8240 },
{ "prime",    8242 },
{ "Prime",    8243 },
{ "lsaquo",   8249 },
{ "rsaquo",   8250 },
{ "oline",    8254 },
{ "frasl",    8260 },
{ "euro",     8364 },


/* Letter type characters */

{ "weierp",   8472 },
{ "image",    8465 },
{ "real",     8476 },
{ "trade",    8482 },
{ "alefsym",  8501 },

/* Arrows */

{ "larr",     8592 },
{ "uarr",     8593 },
{ "rarr",     8594 },
{ "darr",     8595 },
{ "harr",     8596 },
{ "crarr",    8629 },
{ "lArr",     8656 },
{ "uArr",     8657 },
{ "rArr",     8658 },
{ "dArr",     8659 },
{ "hArr",     8660 },

/* Math characters */

{ "forall",   8704 },
{ "part",     8706 },
{ "exist",    8707 },
{ "empty",    8709 },
{ "nabla",    8711 },
{ "isin",     8712 },
{ "notin",    8713 },
{ "ni",       8715 },
{ "prod",     8719 },
{ "sum",      8721 },
{ "minus",    8722 },
{ "lowast",   8727 },
{ "radic",    8730 },
{ "prop",     8733 },
{ "infin",    8734 },
{ "ang",      8736 },
{ "and",      8743 },
{ "or",       8744 },
{ "cap",      8745 },
{ "cup",      8746 },
{ "int",      8747 },
{ "there4",   8756 },
{ "sim",      8764 },
{ "cong",     8773 },
{ "asymp",    8776 },
{ "ne",       8800 },
{ "equiv",    8801 },
{ "le",       8804 },
{ "ge",       8805 },
{ "sub",      8834 },
{ "sup",      8835 },
{ "nsub",     8836 },
{ "sube",     8838 },
{ "supe",     8839 },
{ "oplus",    8853 },
{ "otimes",   8855 },
{ "perp",     8869 },
{ "sdot",     8901 },

/* Misc tech characters */

{ "lceil",    8968 },
{ "rceil",    8969 },
{ "lfloor",   8970 },
{ "rfloor",   8971 },
{ "lang",     9001 },
{ "rang",     9002 },
{ "loz",      9674 },
{ "spades",   9824 },
{ "clubs",    9827 },
{ "hearts",   9829 },
{ "diams",    9830 },

/* END Marker */

{ "",         0    }};
324 
325 
UdmSgmlToUni(const char * sgml)326 udm_wc_t UdmSgmlToUni(const char *sgml)
327 {
328   const struct udm_sgml_chars *p;
329   for(p= SGMLChars; p->unicode; p++)
330   {
331     const char *s, *t;
332     for (s= sgml, t= p->sgml; *s == *t ; s++, t++);
333     if (!*t)
334       return p->unicode;
335   }
336   return 0;
337 }
338 
339 
340 /*
341   Scan an entity. Return number of bytes scanned.
342 */
UdmSGMLScan(udm_wc_t * wc,const unsigned char * str,const unsigned char * end)343 int UdmSGMLScan(udm_wc_t *wc, const unsigned char *str, const unsigned char *end)
344 {
345   const unsigned char *p, *end10= str + 10;
346   if (end10 > end)
347     end10= end;
348   for (p= str + 2; p < end10; p++)
349   {
350     if (*p == ';')
351     {
352       if (str[1] == '#')
353       {
354         if (str[2] == 'x' || str[2] == 'X')
355           *wc= strtoul((const char*)str + 3, NULL, 16);
356         else
357           *wc= strtoul((const char*)str + 2, NULL, 10);
358         if (*wc > 0x10FFFF)
359           *wc= '?';
360       }
361       else
362       {
363         *wc= UdmSgmlToUni((const char*) str + 1);
364       }
365       if (*wc)
366         return p - str + 1;
367     }
368   }
369   *wc= '&';
370   return 1;
371 }
372 
373 
374 /** This function replaces SGML entities
375     With their character equivalents
376 */
377 
378 #define UDM_MAX_SGML_LEN 20
379 
380 UDM_API(char *)
UdmSGMLUnescape(char * str)381 UdmSGMLUnescape(char * str)
382 {
383   char *s= str;
384 
385   while  (*s)
386   {
387     if (*s == '&')
388     {
389       if (s[1] == '#')
390       {
391         char *e;
392         for(e= s + 2; (e - s < UDM_MAX_SGML_LEN) &&
393                       (*e <= '9') && (*e >= '0'); e++);
394         if(*e == ';')
395         {
396           int v= atoi(s + 2);
397           if (v >= 0 && v <= 255)
398           {
399             *s= (char) v;
400           }
401           else
402           {
403             *s = ' ';
404           }
405           memmove(s + 1, e + 1, strlen(e + 1) + 1);
406         }
407       }
408       else
409       {
410         char *e, c;
411         for (e= s + 1; (e - s < UDM_MAX_SGML_LEN) &&
412                        (((*e <= 'z') && (*e >= 'a'))||
413                         ((*e <= 'Z') && (*e >= 'A')));e++);
414 
415         if ((*e == ';') && (c= (char) UdmSgmlToUni(s + 1)))
416         {
417           *s= c;
418           memmove(s + 1, e + 1, strlen(e + 1) + 1);
419         }
420       }
421     }
422     s++;
423   }
424   return str;
425 }
426 
427 
428 /** This function replaces SGML entities
429     With their UNICODE   equivalents
430 */
UdmSGMLUniUnescape(int * ustr)431 void UdmSGMLUniUnescape(int * ustr)
432 {
433   int *s= ustr;
434 
435   for ( ; *s; s++)
436   {
437     if (*s == '&')
438     {
439       char sgml[UDM_MAX_SGML_LEN + 1];
440       int i= 0;
441       if (s[1] == '#')
442       {
443         int *e;
444         for (e= s + 2; (e - s < UDM_MAX_SGML_LEN) &&
445                        (*e <= '9') && (*e >= '0'); e++);
446         if(*e == ';')
447         {
448           for(i= 2; s + i < e; i++)
449             sgml[i - 2]= s[i];
450           sgml[i - 2] = '\0';
451           *s= atoi(sgml);
452           memmove(s + 1, e + 1, sizeof(int) * (UdmUniLen(e + 1) + 1));
453         }
454       }
455       else
456       {
457         int c, *e;
458         for(e= s + 1; (e - s < UDM_MAX_SGML_LEN)&&
459                       (((*e <= 'z') && (*e >= 'a'))||
460                        ((*e <= 'Z') && (*e >= 'A'))); e++)
461         {
462           sgml[i]= (char) *e;
463           i++;
464         }
465         sgml[i]= 0;
466         if((*e == ';') && (c= UdmSgmlToUni(sgml)))
467         {
468           *s= c;
469           memmove(s + 1, e + 1, sizeof(int) * (UdmUniLen(e + 1) + 1));
470         }
471       }
472     }
473   }
474 }
475 
476 
/*
  Copy srclen bytes from src to dst, writing the characters
  & " < > as "&amp;" "&quot;" "&lt;" "&gt;".

  At most dstlen bytes are written. Encoding stops at the first
  character whose (possibly expanded) form does not fit into the
  space that is left. The output is not NUL-terminated.

  Returns the number of bytes written to dst.
*/
size_t UdmHTMLEncode(char *dst, size_t dstlen, const char *src, size_t srclen)
{
  size_t used= 0;   /* bytes written so far, never exceeds dstlen */
  size_t i;

  for (i= 0; i < srclen; i++)
  {
    const char *entity;   /* replacement text, NULL for a plain copy */
    size_t len;

    if (src[i] == '&')
    {
      entity= "&amp;";
      len= 5;
    }
    else if (src[i] == '"')
    {
      entity= "&quot;";
      len= 6;
    }
    else if (src[i] == '<')
    {
      entity= "&lt;";
      len= 4;
    }
    else if (src[i] == '>')
    {
      entity= "&gt;";
      len= 4;
    }
    else
    {
      entity= NULL;
      len= 1;
    }

    if (len > dstlen - used)
      break;

    if (entity)
      memcpy(dst + used, entity, len);
    else
      dst[used]= src[i];
    used+= len;
  }
  return used;
}
518