1 #include "catutil.h"
2
3 #include <stdio.h>
4 #include <string.h>
5 #include <ctype.h>
6
7 #include "charset.h"
8 #include "string16.h"
9 #include "stdio16.h"
10 #include "rxputil.h"
11
12 static char *norm_pub(const char8 *public8, const char16 *public16);
13 static char *norm_sys(const char8 *system8, const char16 *system16);
14
/*
 * Normalize a public identifier given in the build's native Char width.
 *
 * The choice between the 8-bit and 16-bit implementation is made at
 * compile time from CHAR_SIZE; the result of the selected variant is
 * returned unchanged.
 */
char *NormalizePublic(const Char *public)
{
#if CHAR_SIZE != 8
    /* Any width other than 8 is handled by the 16-bit entry point. */
    return NormalizePublic16(public);
#else
    return NormalizePublic8(public);
#endif
}
23
/*
 * Normalize a public identifier held as a 16-bit string.
 *
 * Hands the string to norm_pub() in its 16-bit slot, leaving the 8-bit
 * slot null, and returns norm_pub()'s result.
 */
char *NormalizePublic16(const char16 *public)
{
    return norm_pub(NULL, public);
}
28
/*
 * Normalize a public identifier held as an 8-bit string.
 *
 * Hands the string to norm_pub() in its 8-bit slot, leaving the 16-bit
 * slot null, and returns norm_pub()'s result.
 */
char *NormalizePublic8(const char8 *public)
{
    return norm_pub(public, NULL);
}
33
norm_pub(const char8 * public8,const char16 * public16)34 char *norm_pub(const char8 *public8, const char16 *public16)
35 {
36 int len = public8 ? strlen(public8) : strlen16(public16);
37 int i, j, in_space;
38 char *new_public;
39
40 if(!(new_public = Malloc(len+1)))
41 return 0;
42
43 in_space = 1;
44 for(i=j=0; i<len; i++)
45 {
46 int c = public8 ? (unsigned char)public8[i] : public16[i];
47 if(c > 127)
48 {
49 if(public8)
50 Fprintf(Stderr,
51 "catalog error: non-ascii character in public id %s\n",
52 public8);
53 else
54 Fprintf(Stderr,
55 "catalog error: non-ascii character in public id %ls\n",
56 public16);
57
58 Free(new_public);
59 return 0;
60 }
61 if(c == ' ' || c == '\t' || c == '\r' || c == '\n')
62 {
63 if(!in_space)
64 new_public[j++] = ' ';
65 in_space = 1;
66 }
67 else
68 {
69
70 new_public[j++] = c;
71 in_space = 0;
72 }
73 }
74
75 while(j > 0)
76 {
77 int c = new_public[j-1];
78 if(c == ' ' || c == '\t' || c == '\r' || c == '\n')
79 j--;
80 else
81 break;
82 }
83
84 new_public[j] = 0;
85
86 return new_public;
87 }
88
/*
 * Normalize a system identifier given in the build's native Char width.
 *
 * The choice between the 8-bit and 16-bit implementation is made at
 * compile time from CHAR_SIZE; the result of the selected variant is
 * returned unchanged.
 */
char *NormalizeSystem(const Char *system)
{
#if CHAR_SIZE != 8
    /* Any width other than 8 is handled by the 16-bit entry point. */
    return NormalizeSystem16(system);
#else
    return NormalizeSystem8(system);
#endif
}
97
/*
 * Normalize a system identifier held as a 16-bit string.
 *
 * Hands the string to norm_sys() in its 16-bit slot, leaving the 8-bit
 * slot null, and returns norm_sys()'s result.
 */
char *NormalizeSystem16(const char16 *system)
{
    return norm_sys(NULL, system);
}
102
/*
 * Normalize a system identifier held as an 8-bit string.
 *
 * Hands the string to norm_sys() in its 8-bit slot, leaving the 16-bit
 * slot null, and returns norm_sys()'s result.
 */
char *NormalizeSystem8(const char8 *system)
{
    return norm_sys(system, NULL);
}
107
norm_sys(const char8 * system8,const char16 * system16)108 char *norm_sys(const char8 *system8, const char16 *system16)
109 {
110 int len = system8 ? strlen(system8) : strlen16(system16);
111 int i, j;
112 int c;
113 Vector(char, new_system);
114 char escbuf[13]; /* up to 4 UTF-8 bytes * 3 + null */
115 char *p;
116
117 VectorInit(new_system);
118
119 for(i=j=0; i<len; i++)
120 {
121 c = system8 ? (unsigned char)system8[i] : system16[i];
122
123 if(c > 0x110000)
124 {
125 /* shouldn't happen if it came from an XML document */
126 Fprintf(Stderr,
127 "catalog error: unicode character u+%x > u+110000\n", c);
128 return 0;
129 }
130 else if(c >= 0xd800 && c <= 0xdbff)
131 {
132 /* surrogates */
133
134 int d;
135 if(i == len)
136 {
137 Fprintf(Stderr,
138 "catalog error: unterminated surrogate pair\n", c);
139 return 0;
140 }
141 d = system8 ? (unsigned char)system8[++i] : system16[++i];
142 if(d < 0xdc00 || d > 0xdfff)
143 {
144 Fprintf(Stderr,
145 "catalog error: unterminated surrogate pair\n", c);
146 return 0;
147 }
148 percent_escape(0x10000 + ((c - 0xd800) << 10) + (d - 0xdc00),
149 escbuf);
150 }
151 else if(c >= 0xdc00 && c <= 0xdfff)
152 {
153 /* bogus surrogates */
154
155 Fprintf(Stderr, "catalog error: bad first surrogate u+%x\n", c);
156 return 0;
157 }
158 else if(c < 0x20 || c >= 0x80)
159 {
160 /* controls and non-ascii */
161
162 percent_escape(c, escbuf);
163 }
164 else
165 {
166 /* excluded ascii characters */
167
168 switch(c)
169 {
170 case ' ':
171 case '<':
172 case '>':
173 case '\\':
174 case '^':
175 case '`':
176 case '{':
177 case '|':
178 case '}':
179 case 127:
180 percent_escape(c, escbuf);
181 break;
182 default:
183 if(!VectorPush(new_system, c))
184 return 0;
185 continue;
186 break;
187 }
188 }
189
190 /* copy the escaped characters */
191
192 for(p = escbuf; *p; p++)
193 if(!VectorPush(new_system, *p))
194 return 0;
195 }
196
197 if(!VectorPush(new_system, 0))
198 return 0;
199
200 return new_system;
201 }
202
/*
 * Encode the code point c as UTF-8, one byte value per element of bytes.
 *
 * Uses the original 1- to 6-byte UTF-8 scheme, so any non-negative int
 * can be encoded; bytes must have room for 6 elements.
 *
 * Returns the number of elements written (1..6), or -1 if c is negative
 * (in which case bytes is not touched).
 */
int toUTF8(int c, int *bytes)
{
    /* Lead-byte marker for each sequence length, indexed by length. */
    static const int lead_marker[7] = {0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
    int count;
    int k;

    if(c < 0)
        return -1;

    /* Pick the shortest sequence whose payload bits can hold c. */
    if(c < 0x80)
        count = 1;
    else if(c < 0x800)
        count = 2;
    else if(c < 0x10000)
        count = 3;
    else if(c < 0x200000)
        count = 4;
    else if(c < 0x4000000)
        count = 5;
    else
        count = 6;      /* everything up to 0x7fffffff */

    /* Continuation bytes carry 6 bits each, filled from the last one back. */
    for(k = count - 1; k > 0; k--)
    {
        bytes[k] = 0x80 + (c & 0x3f);
        c >>= 6;
    }

    /* Whatever bits remain go into the lead byte. */
    bytes[0] = lead_marker[count] + c;

    return count;
}
259
/*
 * Write the URI percent-escape of code point c into buf.
 *
 * c is converted to UTF-8 with toUTF8() and each resulting byte is
 * written as '%' followed by exactly two lower-case hex digits; the
 * output is null-terminated.  buf must have room for 3 characters per
 * UTF-8 byte plus the terminator (at most 6 * 3 + 1 = 19).
 *
 * Returns the number of characters written, not counting the
 * terminator, or -1 if toUTF8() rejects c (buf is then left untouched).
 */
int percent_escape(int c, char *buf)
{
    int nbytes, i;
    int bytes[6];

    if((nbytes = toUTF8(c, bytes)) == -1)
        return -1;

    for(i=0; i<nbytes; i++)
    {
        /* XXX upper case?? */
        /* Zero-pad: "%2x" would turn byte 0x09 into "% 9", not "%09". */
        sprintf(buf, "%%%02x", (unsigned int)bytes[i]);
        buf += 3;
    }

    *buf = 0;

    return nbytes * 3;
}
279
/*
 * Test whether id starts with the prefix "urn:publicid:", ignoring the
 * case of ASCII letters.
 *
 * The comparison is done by hand rather than with strncasecmp(): that
 * function is not part of ISO C, and its case folding is only specified
 * for the POSIX locale.
 *
 * Returns 1 if the prefix is present, 0 otherwise (including when id is
 * a null pointer or shorter than the prefix).
 */
int IsPublicidUrn(const char *id)
{
    static const char prefix[] = "urn:publicid:";
    int i;

    if(!id)
        return 0;

    for(i = 0; prefix[i]; i++)
    {
        int ch = (unsigned char)id[i];

        /* Fold ASCII upper case only; anything else must match exactly. */
        if(ch >= 'A' && ch <= 'Z')
            ch += 'a' - 'A';

        /* A terminator in id mismatches here, so we never read past it. */
        if(ch != prefix[i])
            return 0;
    }

    return 1;
}
298
/*
 * Convert a "urn:publicid:" URN back into the public identifier it
 * wraps.
 *
 * The first 13 characters of id are skipped without being examined, so
 * the caller is expected to have checked the prefix already.  In the
 * remainder:
 *   '+'  becomes a space
 *   ':'  becomes "//"
 *   ';'  becomes "::"
 *   %2B %3A %2F %3B %27 %3F %23 %25  become  + : / ; ' ? # %
 *        (the hex letters are accepted in either case)
 * Everything else, including a '%' that does not start one of the
 * sequences above, is copied unchanged.
 *
 * Returns a newly allocated (Malloc) string, or 0 if allocation fails.
 */
char *UnwrapPublicidUrn(const char *id)
{
    /* Recognised %XY sequences; the second digit is listed in both cases. */
    static const struct {
        char first;
        char second_upper;
        char second_lower;
        char plain;
    } escapes[] = {
        {'2', 'B', 'b', '+'},
        {'3', 'A', 'a', ':'},
        {'2', 'F', 'f', '/'},
        {'3', 'B', 'b', ';'},
        {'2', '7', '7', '\''},
        {'3', 'F', 'f', '?'},
        {'2', '3', '3', '#'},
        {'2', '5', '5', '%'}
    };
    const int nescapes = sizeof(escapes) / sizeof(escapes[0]);
    const char *body = id + 13;         /* skip over urn:publicid: */
    int length = 0;
    int doubled = 0;
    int src, dst, k;
    char *result;

    /* Measure the input; ':' and ';' each grow by one character. */
    while(body[length])
    {
        if(body[length] == ':' || body[length] == ';')
            doubled++;
        length++;
    }

    result = Malloc(length + doubled + 1);
    if(!result)
        return 0;

    dst = 0;
    for(src = 0; src < length; src++)
    {
        char ch = body[src];

        if(ch == '+')
        {
            result[dst++] = ' ';
        }
        else if(ch == ':')
        {
            result[dst++] = '/';
            result[dst++] = '/';
        }
        else if(ch == ';')
        {
            result[dst++] = ':';
            result[dst++] = ':';
        }
        else if(ch == '%')
        {
            /*
             * body[src+2] is only examined once body[src+1] has matched
             * a digit, so the terminator is never passed.
             */
            for(k = 0; k < nescapes; k++)
                if(body[src + 1] == escapes[k].first &&
                   (body[src + 2] == escapes[k].second_upper ||
                    body[src + 2] == escapes[k].second_lower))
                    break;

            if(k < nescapes)
            {
                result[dst++] = escapes[k].plain;
                src += 2;               /* consume the two hex digits */
            }
            else
            {
                result[dst++] = ch;     /* unrecognised: keep the '%' */
            }
        }
        else
        {
            result[dst++] = ch;
        }
    }

    result[dst] = 0;

    return result;
}
363
/*
 * Compare a Char string with an 8-bit string, strcmp() style.
 *
 * Both strings are compared element by element as unsigned values, so
 * that a byte >= 0x80 in s2 equals the same value in s1 whatever the
 * width of Char and whatever the signedness of plain char.  (Holding
 * the s2 byte in a plain char, as was done before, discards the
 * unsigned cast on signed-char platforms.)
 *
 * Returns 0 if the strings are equal, -1 if s1 sorts before s2 and 1 if
 * it sorts after; a string that is a proper prefix of the other sorts
 * first.
 */
int strcmpC8(const Char *s1, const char *s2)
{
    int c1, c2;

    while(1)
    {
#if CHAR_SIZE == 8
        /* 8-bit Char: go through unsigned char, as for s2. */
        c1 = (unsigned char)*s1++;
#else
        c1 = *s1++;
#endif
        c2 = (unsigned char)*s2++;

        if(c1 == 0 && c2 == 0)
            return 0;
        if(c1 == 0)
            return -1;
        if(c2 == 0)
            return 1;
        if(c1 < c2)
            return -1;
        if(c1 > c2)
            return 1;
    }
}
385
386