1 /*
2 * htmlify.c --- convert ISO-8859-1 to HTML and back
3 * nca-073-9
4 *
5 * Copyright (c) 1996-2000 by Netcetera AG.
6 * Copyright (c) 2001 by Apache Software Foundation.
7 * All rights reserved.
8 *
9 * See the file "license.terms" for information on usage and
10 * redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
11 *
12 * @(#) $Id: htmlify.c 814683 2009-09-14 15:11:40Z ronnie $
13 *
14 */
15
16 #include <tcl.h>
17 #include <stdio.h>
18 #include <string.h>
19 #include "webutl.h"
20 #include "conv.h"
21 #include "log.h"
22
23 /* ----------------------------------------------------------------------------
24 * htmlifyAppendNum
25 * ------------------------------------------------------------------------- */
htmlifyAppendNum(Tcl_Obj * tclo,int num)26 void htmlifyAppendNum(Tcl_Obj * tclo, int num)
27 {
28
29 Tcl_Obj *intObj = NULL;
30
31 if (tclo == NULL)
32 return;
33
34 intObj = Tcl_NewIntObj(num);
35
36 if (intObj != NULL) {
37 Tcl_IncrRefCount(intObj);
38 Tcl_AppendToObj(tclo, "&#", 2);
39 Tcl_AppendObjToObj(tclo, intObj);
40 Tcl_DecrRefCount(intObj);
41 Tcl_AppendToObj(tclo, ";", 1);
42 }
43 }
44
45
46 /* ----------------------------------------------------------------------------
47 * webHtmlify -- convert string from ISO-8859-1 to HTML
48 * ------------------------------------------------------------------------- */
webHtmlify(ConvData * convData,Tcl_Obj * in,int useNumeric)49 Tcl_Obj *webHtmlify(ConvData * convData, Tcl_Obj * in, int useNumeric)
50 {
51
52 int iPos = 0;
53 int len = 0;
54 Tcl_UniChar unic = 0;
55 Tcl_Obj *entity = NULL;
56 Tcl_Obj *res = NULL;
57
58 if ((convData == NULL) || (in == NULL))
59 return NULL;
60
61 res = Tcl_NewObj();
62 Tcl_IncrRefCount(res);
63
64 /* ------------------------------------------------------------------------
65 * loop over input string
66 * --------------------------------------------------------------------- */
67 len = Tcl_GetCharLength(in);
68
69 for (iPos = 0; iPos < len; iPos++) {
70
71 unic = Tcl_GetUniChar(in, iPos);
72
73 if (unic == 0)
74 break;
75
76 /* --------------------------------------------------------------------
77 * translation needed ?
78 * ----------------------------------------------------------------- */
79 if (unic <= WEBENC_LATIN_TABLE_LENGTH &&
80 convData->need[unic] == TCL_OK) {
81
82 /* yes */
83
84 if (useNumeric == TCL_OK) {
85
86 /* numeric ? */
87 htmlifyAppendNum(res, unic);
88 }
89 else {
90
91 /* no, entity */
92
93 entity = convData->ute[unic];
94
95 if (entity == NULL) {
96 htmlifyAppendNum(res, unic);
97 }
98 else {
99
100 Tcl_AppendToObj(res, "&", 1);
101 Tcl_AppendObjToObj(res, entity);
102 Tcl_AppendToObj(res, ";", 1);
103 }
104 }
105 }
106 else {
107 if (unic > WEBENC_LATIN_TABLE_LENGTH) {
108 /* numeric translation, because there is no entity
109 for characters > 256 (multibyte character sets */
110 htmlifyAppendNum(res, unic);
111 } else {
112 /* no, no translation needed */
113 Tcl_AppendUnicodeToObj(res, &unic, 1);
114 }
115 }
116 }
117
118 return res;
119 }
120
/* ----------------------------------------------------------------------------
 * Macros for webDeHtmlify
 *
 * Note: All macros end with 'pos' on the last position that belonged
 *       to the tag or entity
 * ------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------------
 * HANDLE_TAG -- skip a tag or comment that starts at unic[pos]
 *
 *   unic    input characters
 *   length  number of characters in unic
 *   out     object that receives output
 *   pos     in: index of the opening '<'; out: index of the closing '>',
 *           or length - 1 if the tag was never closed
 *   err     not used by this macro
 *
 * A plain tag ends at the first '>'. A tag starting with "<!--" is a comment
 * and ends only at a '>' that is directly preceded by "--". A properly closed
 * tag or comment produces no output; an unfinished one is copied to 'out'
 * unchanged, from the '<' to the end of the input.
 * ------------------------------------------------------------------------- */
#define HANDLE_TAG(unic, length, out, pos, err) do { \
    const int tagBegin_ = (pos); \
    int tagIsCmt_ = 0; \
    int tagClosed_ = 0; \
    \
    /* look ahead only while the three characters are inside the input */ \
    if ((tagBegin_ + 3 < (length)) && \
        ((unic)[tagBegin_ + 1] == '!') && \
        ((unic)[tagBegin_ + 2] == '-') && \
        ((unic)[tagBegin_ + 3] == '-')) { \
        tagIsCmt_ = 1; \
    } \
    \
    for ((pos) = tagBegin_ + 1; (pos) < (length); (pos)++) { \
        if ((unic)[(pos)] != '>') \
            continue; \
        /* inside a comment a '>' only counts when it completes "-->" */ \
        if (!tagIsCmt_ || \
            (((unic)[(pos) - 1] == '-') && ((unic)[(pos) - 2] == '-'))) { \
            tagClosed_ = 1; \
            break; \
        } \
    } \
    \
    if (!tagClosed_) { \
        /* unfinished tag or comment: keep the text as it is */ \
        (pos) = (length) - 1; \
        Tcl_AppendUnicodeToObj((out), &(unic)[tagBegin_], \
                               (pos) - tagBegin_ + 1); \
    } \
} while (0)
163
/* ----------------------------------------------------------------------------
 * HANDLE_ENTITY -- resolve the entity that starts at unic[pos]
 *
 *   convData  conversion data, handed on to HANDLE_KEY_ENTITY
 *   unic      input characters
 *   length    number of characters in unic
 *   out       object that receives output
 *   pos       in: index of the '&'; out: last index consumed
 *   err       counter handed on to the sub-macros
 *
 * The entity text runs from the character after '&' up to (not including)
 * the first ';', ' ' or '<', or to the end of the input. A terminating ';'
 * is consumed; ' ' and '<' are left for the caller. Text starting with '#'
 * is treated as a number (HANDLE_UNICODE_ENTITY), anything else as a name
 * (HANDLE_KEY_ENTITY). An '&' at the very end of the input is copied as is.
 * ------------------------------------------------------------------------- */
#define HANDLE_ENTITY(convData, unic, length, out, pos, err) do { \
    const int entBegin_ = (pos); \
    int entFirst_ = entBegin_ + 1; \
    int entEnd_ = entFirst_; \
    \
    (pos) = entFirst_; \
    \
    if (entFirst_ >= (length)) { \
        /* an ampersand at the very last position, just write it */ \
        Tcl_AppendUnicodeToObj((out), &(unic)[entBegin_], 1); \
    } else { \
        /* search for end of entity; check the bound before reading */ \
        for (;;) { \
            if (entEnd_ >= (length)) { \
                (pos) = (length) - 1; \
                break; \
            } \
            if ((unic)[entEnd_] == ';') { \
                (pos) = entEnd_; \
                break; \
            } \
            if (((unic)[entEnd_] == ' ') || ((unic)[entEnd_] == '<')) { \
                (pos) = entEnd_ - 1; \
                break; \
            } \
            entEnd_++; \
        } \
        \
        if ((unic)[entFirst_] == '#') { \
            /* a number */ \
            HANDLE_UNICODE_ENTITY(unic, length, out, \
                                  entBegin_, entFirst_, entEnd_, err); \
        } else { \
            HANDLE_KEY_ENTITY(convData, unic, length, out, \
                              entBegin_, entFirst_, entEnd_, err); \
        } \
    } \
} while (0)
210
/* ----------------------------------------------------------------------------
 * HANDLE_UNICODE_ENTITY -- resolve a numeric entity ("&#NNN;")
 *
 *   unic    input characters
 *   length  number of characters in unic
 *   out     object that receives output
 *   begin   index of the '&'
 *   first   index of the '#'; incremented here to the first digit
 *   end     index just past the last character of the entity text
 *   err     counter, incremented when the entity cannot be resolved
 *
 * If the text after '#' parses as an integer in the range 0 .. 32767, the
 * corresponding character is appended to 'out'. Otherwise (not a number,
 * negative, or too large) the original text is appended unchanged,
 * including a terminating ';' if there is one, and 'err' is incremented.
 * ------------------------------------------------------------------------- */
#define HANDLE_UNICODE_ENTITY(unic, length, out, begin, first, end, err) do { \
    int numValue_ = 0; \
    int numRejected_ = 0; \
    Tcl_Obj *numObj_ = NULL; \
    \
    (first)++; /* step over the '#' */ \
    \
    numObj_ = Tcl_NewUnicodeObj(&(unic)[(first)], (end) - (first)); \
    Tcl_IncrRefCount(numObj_); \
    if (Tcl_GetIntFromObj(NULL, numObj_, &numValue_) == TCL_ERROR) { \
        /* no valid number */ \
        numRejected_ = 1; \
    } else if ((numValue_ < 0) || (numValue_ > 32768 - 1)) { \
        /* a negative value would wrap when cast to Tcl_UniChar; */ \
        /* the upper limit is the one the original range check used */ \
        numRejected_ = 1; \
    } \
    Tcl_DecrRefCount(numObj_); \
    \
    if (numRejected_) { \
        /* we write the string instead */ \
        Tcl_AppendUnicodeToObj((out), &(unic)[(begin)], (end) - (begin)); \
        if (((end) < (length)) && ((unic)[(end)] == ';')) { \
            /* don't forget this one! */ \
            Tcl_AppendUnicodeToObj((out), &(unic)[(end)], 1); \
        } \
        (err)++; \
    } else { \
        Tcl_UniChar numChar_ = (Tcl_UniChar) numValue_; \
        Tcl_AppendUnicodeToObj((out), &numChar_, 1); \
    } \
} while (0)
238
/* ----------------------------------------------------------------------------
 * HANDLE_KEY_ENTITY -- resolve a named entity ("&name;") via lookup table
 *
 *   convData  conversion data; the name is looked up in convData->etu
 *   unic      input characters
 *   length    number of characters in unic
 *   out       object that receives output
 *   begin     index of the '&'
 *   first     index of the first character of the name
 *   end       index just past the last character of the name
 *   err       counter, incremented when the name is unknown
 *
 * A known name is replaced by the character whose code is stored in the
 * table. An unknown name is appended to 'out' unchanged, including a
 * terminating ';' if there is one. If the table entry is not an integer,
 * nothing is written.
 * ------------------------------------------------------------------------- */
#define HANDLE_KEY_ENTITY(convData, unic, length, out, begin, first, end, err) do { \
    Tcl_Obj *keyCode_ = NULL; \
    Tcl_Obj *keyName_ = Tcl_NewUnicodeObj(&(unic)[(first)], (end) - (first)); \
    \
    /* use lookup table; the temporary name object is only the hash key */ \
    Tcl_IncrRefCount(keyName_); \
    keyCode_ = (Tcl_Obj *) getFromHashTable((convData)->etu, \
                                            Tcl_GetString(keyName_)); \
    Tcl_DecrRefCount(keyName_); \
    \
    if (keyCode_ == NULL) { \
        /* not in table, we write the string instead */ \
        Tcl_AppendUnicodeToObj((out), &(unic)[(begin)], (end) - (begin)); \
        if (((end) < (length)) && ((unic)[(end)] == ';')) { \
            /* don't forget this one! */ \
            Tcl_AppendUnicodeToObj((out), &(unic)[(end)], 1); \
        } \
        (err)++; \
    } else { \
        /* got it in table; keyCode_ stays owned by the hashtable */ \
        int keyValue_ = 0; \
        if (Tcl_GetIntFromObj(NULL, keyCode_, &keyValue_) != TCL_ERROR) { \
            Tcl_UniChar keyChar_ = (Tcl_UniChar) keyValue_; \
            Tcl_AppendUnicodeToObj((out), &keyChar_, 1); \
        } \
        /* else: we do not have invalid values in the hashtable */ \
    } \
} while (0)
266
267 /* ----------------------------------------------------------------------------
268 * webDeHtmlify -- de-htmlifies input string 'in' and writes to 'out'
269 * ------------------------------------------------------------------------- */
webDeHtmlify(ConvData * convData,Tcl_Obj * in,Tcl_Obj * out)270 int webDeHtmlify(ConvData * convData, Tcl_Obj * in, Tcl_Obj * out)
271 {
272
273 int length; /* length of input */
274 int pos = 0; /* actual position in string */
275 Tcl_UniChar *unic;
276 int plainfirst = 0;
277 int plainend = 0;
278 int err = 0; /* temporary use, may be removed */
279
280 if (in == NULL || out == NULL) {
281 return TCL_ERROR;
282 }
283
284 unic = Tcl_GetUnicode(in);
285 length = Tcl_GetCharLength(in);
286
287 if (length == 1) {
288 if ((unic[0] == '>') || (unic[pos] == '>')) {
289 /* nada */
290 return TCL_OK;
291 }
292 else {
293 Tcl_AppendUnicodeToObj(out, &unic[0], 1);
294 return TCL_OK;
295 }
296 }
297
298 while (pos < length) {
299
300 plainend = pos;
301 if (unic[pos] == '<') {
302
303 /* dump */
304 Tcl_AppendUnicodeToObj(out, &unic[plainfirst],
305 plainend - plainfirst);
306
307 /* ---------------------------------------------------------------------
308 * we're in a tag, thus we skip everything
309 * --------------------------------------------------------------------*/
310 HANDLE_TAG(unic, length, out, pos, err);
311 plainfirst = pos + 1;
312
313 }
314 else if (unic[pos] == '>') {
315
316 /* dump */
317 Tcl_AppendUnicodeToObj(out, &unic[plainfirst],
318 plainend - plainfirst);
319 /* syntax error, too many closing '>' */
320 Tcl_AppendUnicodeToObj(out, &(unic[pos]), 1);
321 /*Tcl_SetStringObj(out,"Error: web::dehtmlify, unbalanced '>'",-1); */
322 plainfirst = pos + 1;
323
324 }
325 else if (unic[pos] == '&') {
326
327 /* dump */
328 Tcl_AppendUnicodeToObj(out, &unic[plainfirst],
329 plainend - plainfirst);
330 /*
331 * it's an entity
332 */
333 HANDLE_ENTITY(convData, unic, length, out, pos, err);
334 plainfirst = pos + 1;
335 }
336
337 /* ------------------------------------------------------------------------
338 * search on
339 * ----------------------------------------------------------------------*/
340 pos++;
341 }
342 /* final dump */
343 if (plainend >= plainfirst && plainend > 0) {
344 Tcl_AppendUnicodeToObj(out, &unic[plainfirst],
345 plainend - plainfirst + 1);
346 }
347
348 return TCL_OK;
349 }
350
351 /* ----------------------------------------------------------------------------
352 * findCmtClose
353 * ------------------------------------------------------------------------- */
findHtmlCmtClose(TCLCONST char * utf)354 TCLCONST char *findHtmlCmtClose(TCLCONST char *utf)
355 {
356
357 TCLCONST char *cmtclose = NULL;
358 TCLCONST char *next1 = NULL;
359 TCLCONST char *next2 = NULL;
360
361 if (utf == NULL)
362 return NULL;
363
364 while ((cmtclose = Tcl_UtfFindFirst(utf, '-')) != NULL) {
365
366 next1 = NULL;
367 next2 = NULL;
368
369 next1 = Tcl_UtfNext(cmtclose);
370 if (next1 != NULL)
371 next2 = Tcl_UtfNext(next1);
372
373 if ((next1[0] == '-') && (next2[0] == '>')) {
374 return next2;
375 }
376 utf = Tcl_UtfNext(cmtclose);
377 }
378 return NULL;
379 }
380
381 /* ----------------------------------------------------------------------------
382 * removeHtmlComments --
383 * Scans inString for HTML comments. Upon completion, inString will
384 * contain the input minus all HTML comments.
385 * ------------------------------------------------------------------------- */
removeHtmlComments(Tcl_Interp * interp,Tcl_Obj * in,Tcl_Obj * res)386 int removeHtmlComments(Tcl_Interp * interp, Tcl_Obj * in, Tcl_Obj * res)
387 {
388
389 int len = 0;
390 TCLCONST char *utf = NULL;
391 TCLCONST char *cmtopen = NULL;
392 TCLCONST char *cmtclose = NULL;
393 TCLCONST char *next1 = NULL;
394 TCLCONST char *next2 = NULL;
395 TCLCONST char *next3 = NULL;
396
397 if ((in == NULL) || (res == NULL))
398 return TCL_ERROR;
399
400 utf = Tcl_GetStringFromObj(in, &len);
401
402 if (len == 0)
403 return TCL_OK;
404
405 /* --------------------------------------------------------------------------
406 * fast forward to first "<"
407 * ----------------------------------------------------------------------- */
408 while ((cmtopen = Tcl_UtfFindFirst(utf, '<')) != NULL) {
409
410 next1 = NULL;
411 next2 = NULL;
412 next3 = NULL;
413
414 next1 = Tcl_UtfNext(cmtopen);
415 if (next1 != NULL)
416 next2 = Tcl_UtfNext(next1);
417 if (next2 != NULL)
418 next3 = Tcl_UtfNext(next2);
419
420 if (next1[0] == '!') {
421 /* ----------------------------------------------------------------------
422 * starts like a comment.
423 * ------------------------------------------------------------------- */
424 if ((next2[0] == '-') && (next3[0] == '-')) {
425 Tcl_AppendToObj(res, utf, cmtopen - utf);
426 cmtclose = findHtmlCmtClose(Tcl_UtfNext(next3));
427 if (cmtclose == NULL) {
428 Tcl_AppendToObj(res, cmtopen, -1);
429 LOG_MSG(interp, WRITE_LOG, __FILE__, __LINE__,
430 "removeHtmlComments", WEBLOG_INFO,
431 "end of string encountered while searching for comment-end",
432 NULL);
433 return TCL_OK;
434 }
435 else {
436 utf = Tcl_UtfNext(cmtclose);
437 }
438 }
439 else {
440
441 if (next2[0] == '>') {
442 Tcl_AppendToObj(res, utf, cmtopen - utf);
443 utf = next3;
444 }
445 else {
446
447 Tcl_AppendToObj(res, utf, cmtopen - utf + 1);
448 utf = next1;
449 }
450 }
451 }
452 else {
453 /* ----------------------------------------------------------------------
454 * not a comment. proceed.
455 * ------------------------------------------------------------------- */
456 Tcl_AppendToObj(res, utf, cmtopen - utf + 1);
457 utf = next1;
458 }
459 }
460 /* ------------------------------------------------------------------------
461 * append rest
462 * --------------------------------------------------------------------- */
463 if (utf != NULL)
464 Tcl_AppendToObj(res, utf, -1);
465
466 return TCL_OK;
467 }
468