1 /*
2  * htmlify.c --- convert ISO-8859-1 to HTML and back
3  * nca-073-9
4  *
5  * Copyright (c) 1996-2000 by Netcetera AG.
6  * Copyright (c) 2001 by Apache Software Foundation.
7  * All rights reserved.
8  *
9  * See the file "license.terms" for information on usage and
10  * redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
11  *
12  * @(#) $Id: htmlify.c 814683 2009-09-14 15:11:40Z ronnie $
13  *
14  */
15 
16 #include <tcl.h>
17 #include <stdio.h>
18 #include <string.h>
19 #include "webutl.h"
20 #include "conv.h"
21 #include "log.h"
22 
23 /* ----------------------------------------------------------------------------
24  * htmlifyAppendNum
25  * ------------------------------------------------------------------------- */
htmlifyAppendNum(Tcl_Obj * tclo,int num)26 void htmlifyAppendNum(Tcl_Obj * tclo, int num)
27 {
28 
29     Tcl_Obj *intObj = NULL;
30 
31     if (tclo == NULL)
32 	return;
33 
34     intObj = Tcl_NewIntObj(num);
35 
36     if (intObj != NULL) {
37 	Tcl_IncrRefCount(intObj);
38 	Tcl_AppendToObj(tclo, "&#", 2);
39 	Tcl_AppendObjToObj(tclo, intObj);
40 	Tcl_DecrRefCount(intObj);
41 	Tcl_AppendToObj(tclo, ";", 1);
42     }
43 }
44 
45 
46 /* ----------------------------------------------------------------------------
47  * webHtmlify -- convert string from ISO-8859-1 to HTML
48  * ------------------------------------------------------------------------- */
webHtmlify(ConvData * convData,Tcl_Obj * in,int useNumeric)49 Tcl_Obj *webHtmlify(ConvData * convData, Tcl_Obj * in, int useNumeric)
50 {
51 
52     int iPos = 0;
53     int len = 0;
54     Tcl_UniChar unic = 0;
55     Tcl_Obj *entity = NULL;
56     Tcl_Obj *res = NULL;
57 
58     if ((convData == NULL) || (in == NULL))
59 	return NULL;
60 
61     res = Tcl_NewObj();
62     Tcl_IncrRefCount(res);
63 
64     /* ------------------------------------------------------------------------
65      * loop over input string
66      * --------------------------------------------------------------------- */
67     len = Tcl_GetCharLength(in);
68 
69     for (iPos = 0; iPos < len; iPos++) {
70 
71 	unic = Tcl_GetUniChar(in, iPos);
72 
73 	if (unic == 0)
74 	    break;
75 
76 	/* --------------------------------------------------------------------
77 	 * translation needed ?
78 	 * ----------------------------------------------------------------- */
79 	if (unic <= WEBENC_LATIN_TABLE_LENGTH &&
80 	    convData->need[unic] == TCL_OK) {
81 
82 	    /* yes */
83 
84 	    if (useNumeric == TCL_OK) {
85 
86 		/* numeric ? */
87 		htmlifyAppendNum(res, unic);
88 	    }
89 	    else {
90 
91 		/* no, entity */
92 
93 		entity = convData->ute[unic];
94 
95 		if (entity == NULL) {
96 		    htmlifyAppendNum(res, unic);
97 		}
98 		else {
99 
100 		    Tcl_AppendToObj(res, "&", 1);
101 		    Tcl_AppendObjToObj(res, entity);
102 		    Tcl_AppendToObj(res, ";", 1);
103 		}
104 	    }
105 	}
106 	else {
107 	  if (unic > WEBENC_LATIN_TABLE_LENGTH) {
108 	    /* numeric translation, because there is no entity
109 	       for characters > 256 (multibyte character sets */
110 	    htmlifyAppendNum(res, unic);
111 	  } else {
112 	    /* no, no translation needed */
113 	    Tcl_AppendUnicodeToObj(res, &unic, 1);
114 	  }
115 	}
116     }
117 
118     return res;
119 }
120 
/* ----------------------------------------------------------------------------
 * Macros for webDeHtmlify
 *
 * Note: All macros end with 'pos' on the last position that belonged
 *       to the tag or entity
 * ------------------------------------------------------------------------- */

/*
 * HANDLE_TAG -- skip a tag ("<...>") or a comment ("<!-- ... -->")
 *
 *   unic    input characters
 *   length  number of characters in unic
 *   out     object that receives text which cannot be skipped
 *   pos     in: index of the opening '<'; out: index of the last character
 *           consumed (must be a modifiable int variable)
 *   err     unused, present for symmetry with the other macros
 *
 * A tag ends at the next '>'. A comment only ends at a '>' that directly
 * follows "--". When the input ends before the tag or comment is closed,
 * everything from the '<' to the end of the input is appended to out
 * unchanged.
 */
#define HANDLE_TAG(unic,length,out,pos,err) do { \
  int tagStart = (pos);   /* index of the opening '<' */ \
  int tagOpen = 1;        /* cleared once the closing '>' was seen */ \
  int inCmt = 0;          /* set while inside "<!-- ... -->" */ \
  /* "<!--" needs three more characters after the '<' */ \
  if (((pos) + 3 < (length)) && ((unic)[(pos) + 1] == '!') && \
      ((unic)[(pos) + 2] == '-') && ((unic)[(pos) + 3] == '-')) { \
    inCmt = 1; \
  } \
  (pos)++; \
  while ((pos) < (length)) { \
    if ((unic)[(pos)] == '>') { \
      /* inside a comment a '>' only counts when preceded by "--" */ \
      if (!inCmt || \
          (((unic)[(pos) - 1] == '-') && ((unic)[(pos) - 2] == '-'))) { \
        tagOpen = 0; \
        inCmt = 0; \
        (pos)++; \
        break; \
      } \
    } \
    (pos)++; \
  } \
  (pos)--; /* to be on last char of tag */ \
  if (inCmt || tagOpen) { \
    /* unfinished tag or comment: keep the text */ \
    Tcl_AppendUnicodeToObj((out), &((unic)[tagStart]), \
                           (pos) - tagStart + 1); \
  } \
} while (0)
163 
/*
 * HANDLE_ENTITY -- decode the entity that starts at an '&'
 *
 *   convData  conversion tables, handed on to HANDLE_KEY_ENTITY
 *   unic      input characters
 *   length    number of characters in unic
 *   out       object that receives the decoded character or the raw text
 *   pos       in: index of the '&'; out: index of the last character
 *             consumed (must be a modifiable int variable)
 *   err       int variable, incremented by the sub-macros for every entity
 *             that could not be decoded
 *
 * The entity extends up to the next ';', ' ' or '<', or to the end of the
 * input. The ';' is consumed, ' ' and '<' are left for the caller. Text
 * that starts with '#' goes to HANDLE_UNICODE_ENTITY, everything else to
 * HANDLE_KEY_ENTITY. An '&' in the very last position is copied as it is;
 * in that case pos ends one past the '&'.
 */
#define HANDLE_ENTITY(convData, unic, length, out, pos, err) do { \
  int ampAt = (pos);      /* index of the '&' */ \
  int nameAt = ++(pos);   /* first character after the '&' */ \
  int stopAt = nameAt;    /* becomes the index where the entity stops */ \
  \
  if (nameAt >= (length)) { \
    /* an ampersand at the very last position, just write it */ \
    Tcl_AppendUnicodeToObj((out), &((unic)[ampAt]), 1); \
  } else { \
    /* search for end of entity; test the bound before reading */ \
    while ((stopAt < (length)) && ((unic)[stopAt] != ';') && \
           ((unic)[stopAt] != ' ') && ((unic)[stopAt] != '<')) { \
      stopAt++; \
    } \
    if (stopAt >= (length)) { \
      (pos) = (length) - 1; \
    } else if ((unic)[stopAt] == ';') { \
      (pos) = stopAt; \
    } else { \
      (pos) = stopAt - 1; \
    } \
    \
    if ((unic)[nameAt] == '#') { \
      /* a number */ \
      HANDLE_UNICODE_ENTITY(unic, length, out, ampAt, nameAt, stopAt, err); \
    } else { \
      HANDLE_KEY_ENTITY(convData, unic, length, out, ampAt, nameAt, stopAt, err); \
    } \
  } \
} while (0)
210 
/*
 * HANDLE_UNICODE_ENTITY -- decode a numeric entity ("&#NNN;")
 *
 *   unic    input characters
 *   length  number of characters in unic
 *   out     object that receives the decoded character or the raw text
 *   begin   index of the '&'
 *   first   index of the '#'; incremented by this macro so that it points
 *           to the first digit (must be a modifiable int variable)
 *   end     index one past the last character of the number (the position
 *           of the terminating ';' if there is one)
 *   err     int variable, incremented when the entity cannot be decoded
 *
 * The number is parsed with Tcl_GetIntFromObj. If it is not a valid
 * integer, is negative, or is larger than 32767, the original text
 * including a terminating ';' is appended to out unchanged and err is
 * incremented.
 */
#define HANDLE_UNICODE_ENTITY(unic, length, out, begin, first, end, err) do { \
  int ueValue = 0; \
  Tcl_UniChar ueChar = 0; \
  Tcl_Obj *ueDigits = NULL; \
  (first)++;   /* skip the '#' */ \
  \
  ueDigits = Tcl_NewUnicodeObj(&((unic)[(first)]), (end) - (first)); \
  Tcl_IncrRefCount(ueDigits); \
  /* NOTE(review): the upper limit 32767 is kept from the original code; */ \
  /* confirm whether the full range of Tcl_UniChar is intended instead.  */ \
  if ((Tcl_GetIntFromObj(NULL, ueDigits, &ueValue) == TCL_ERROR) || \
      (ueValue < 0) || (ueValue > 32768 - 1)) { \
    /* not a usable number, we write the string instead */ \
    Tcl_AppendUnicodeToObj((out), &((unic)[(begin)]), (end) - (begin)); \
    if (((end) < (length)) && ((unic)[(end)] == ';')) { \
      /* don't forget this one! */ \
      Tcl_AppendUnicodeToObj((out), &((unic)[(end)]), 1); \
    } \
    (err)++; \
  } else { \
    ueChar = (Tcl_UniChar) ueValue; \
    Tcl_AppendUnicodeToObj((out), &ueChar, 1); \
  } \
  Tcl_DecrRefCount(ueDigits); \
} while (0)
238 
/*
 * HANDLE_KEY_ENTITY -- decode a named entity ("&name;")
 *
 *   convData  conversion tables; the name is looked up in convData->etu
 *   unic      input characters
 *   length    number of characters in unic
 *   out       object that receives the decoded character or the raw text
 *   begin     index of the '&'
 *   first     index of the first character of the name
 *   end       index one past the last character of the name (the position
 *             of the terminating ';' if there is one)
 *   err       int variable, incremented when the entity cannot be decoded
 *
 * When the name is not in the table, or the table value cannot be read as
 * an integer, the original text including a terminating ';' is appended
 * to out unchanged and err is incremented, so no input is ever lost.
 */
#define HANDLE_KEY_ENTITY(convData, unic, length, out, begin, first, end, err) do { \
  int keCode = 0; \
  Tcl_Obj *keHit = NULL; \
  Tcl_Obj *keName = Tcl_NewUnicodeObj(&((unic)[(first)]), (end) - (first)); \
  Tcl_IncrRefCount(keName); \
  /* use lookup table */ \
  keHit = (Tcl_Obj *) getFromHashTable((convData)->etu, \
                                       Tcl_GetString(keName)); \
  Tcl_DecrRefCount(keName); \
  \
  if ((keHit != NULL) && \
      (Tcl_GetIntFromObj(NULL, keHit, &keCode) != TCL_ERROR)) { \
    /* got it in table; keHit is not released here because this */ \
    /* macro never took a reference to it */ \
    Tcl_UniChar keChar = (Tcl_UniChar) keCode; \
    Tcl_AppendUnicodeToObj((out), &keChar, 1); \
  } else { \
    /* unknown or unusable entry, we write the string instead */ \
    Tcl_AppendUnicodeToObj((out), &((unic)[(begin)]), (end) - (begin)); \
    if (((end) < (length)) && ((unic)[(end)] == ';')) { \
      /* don't forget this one! */ \
      Tcl_AppendUnicodeToObj((out), &((unic)[(end)]), 1); \
    } \
    (err)++; \
  } \
} while (0)
266 
267 /* ----------------------------------------------------------------------------
268  *  webDeHtmlify -- de-htmlifies input string 'in' and writes to 'out'
269  * ------------------------------------------------------------------------- */
webDeHtmlify(ConvData * convData,Tcl_Obj * in,Tcl_Obj * out)270 int webDeHtmlify(ConvData * convData, Tcl_Obj * in, Tcl_Obj * out)
271 {
272 
273     int length;			/* length of input */
274     int pos = 0;		/* actual position in string */
275     Tcl_UniChar *unic;
276     int plainfirst = 0;
277     int plainend = 0;
278     int err = 0;		/* temporary use, may be removed */
279 
280     if (in == NULL || out == NULL) {
281 	return TCL_ERROR;
282     }
283 
284     unic = Tcl_GetUnicode(in);
285     length = Tcl_GetCharLength(in);
286 
287     if (length == 1) {
288 	if ((unic[0] == '>') || (unic[pos] == '>')) {
289 	    /* nada */
290 	    return TCL_OK;
291 	}
292 	else {
293 	    Tcl_AppendUnicodeToObj(out, &unic[0], 1);
294 	    return TCL_OK;
295 	}
296     }
297 
298     while (pos < length) {
299 
300 	plainend = pos;
301 	if (unic[pos] == '<') {
302 
303 	    /* dump */
304 	    Tcl_AppendUnicodeToObj(out, &unic[plainfirst],
305 				   plainend - plainfirst);
306 
307 	    /* ---------------------------------------------------------------------
308 	     * we're in a tag, thus we skip everything
309 	     * --------------------------------------------------------------------*/
310 	    HANDLE_TAG(unic, length, out, pos, err);
311 	    plainfirst = pos + 1;
312 
313 	}
314 	else if (unic[pos] == '>') {
315 
316 	    /* dump */
317 	    Tcl_AppendUnicodeToObj(out, &unic[plainfirst],
318 				   plainend - plainfirst);
319 	    /* syntax error, too many closing '>' */
320 	    Tcl_AppendUnicodeToObj(out, &(unic[pos]), 1);
321 	    /*Tcl_SetStringObj(out,"Error: web::dehtmlify, unbalanced '>'",-1); */
322 	    plainfirst = pos + 1;
323 
324 	}
325 	else if (unic[pos] == '&') {
326 
327 	    /* dump */
328 	    Tcl_AppendUnicodeToObj(out, &unic[plainfirst],
329 				   plainend - plainfirst);
330 	    /*
331 	     * it's an entity
332 	     */
333 	    HANDLE_ENTITY(convData, unic, length, out, pos, err);
334 	    plainfirst = pos + 1;
335 	}
336 
337 	/* ------------------------------------------------------------------------
338 	 * search on
339 	 * ----------------------------------------------------------------------*/
340 	pos++;
341     }
342     /* final dump */
343     if (plainend >= plainfirst && plainend > 0) {
344 	Tcl_AppendUnicodeToObj(out, &unic[plainfirst],
345 			       plainend - plainfirst + 1);
346     }
347 
348     return TCL_OK;
349 }
350 
351 /* ----------------------------------------------------------------------------
352  * findCmtClose
353  * ------------------------------------------------------------------------- */
findHtmlCmtClose(TCLCONST char * utf)354 TCLCONST char *findHtmlCmtClose(TCLCONST char *utf)
355 {
356 
357     TCLCONST char *cmtclose = NULL;
358     TCLCONST char *next1 = NULL;
359     TCLCONST char *next2 = NULL;
360 
361     if (utf == NULL)
362 	return NULL;
363 
364     while ((cmtclose = Tcl_UtfFindFirst(utf, '-')) != NULL) {
365 
366 	next1 = NULL;
367 	next2 = NULL;
368 
369 	next1 = Tcl_UtfNext(cmtclose);
370 	if (next1 != NULL)
371 	    next2 = Tcl_UtfNext(next1);
372 
373 	if ((next1[0] == '-') && (next2[0] == '>')) {
374 	    return next2;
375 	}
376 	utf = Tcl_UtfNext(cmtclose);
377     }
378     return NULL;
379 }
380 
381 /* ----------------------------------------------------------------------------
382  * removeHtmlComments --
383  *   Scans inString for HTML comments. Upon completion, inString will
384  *   contain the input minus all HTML comments.
385  * ------------------------------------------------------------------------- */
removeHtmlComments(Tcl_Interp * interp,Tcl_Obj * in,Tcl_Obj * res)386 int removeHtmlComments(Tcl_Interp * interp, Tcl_Obj * in, Tcl_Obj * res)
387 {
388 
389     int len = 0;
390     TCLCONST char *utf = NULL;
391     TCLCONST char *cmtopen = NULL;
392     TCLCONST char *cmtclose = NULL;
393     TCLCONST char *next1 = NULL;
394     TCLCONST char *next2 = NULL;
395     TCLCONST char *next3 = NULL;
396 
397     if ((in == NULL) || (res == NULL))
398 	return TCL_ERROR;
399 
400     utf = Tcl_GetStringFromObj(in, &len);
401 
402     if (len == 0)
403 	return TCL_OK;
404 
405     /* --------------------------------------------------------------------------
406      * fast forward to first "<"
407      * ----------------------------------------------------------------------- */
408     while ((cmtopen = Tcl_UtfFindFirst(utf, '<')) != NULL) {
409 
410 	next1 = NULL;
411 	next2 = NULL;
412 	next3 = NULL;
413 
414 	next1 = Tcl_UtfNext(cmtopen);
415 	if (next1 != NULL)
416 	    next2 = Tcl_UtfNext(next1);
417 	if (next2 != NULL)
418 	    next3 = Tcl_UtfNext(next2);
419 
420 	if (next1[0] == '!') {
421 	    /* ----------------------------------------------------------------------
422 	     * starts like a comment.
423 	     * ------------------------------------------------------------------- */
424 	    if ((next2[0] == '-') && (next3[0] == '-')) {
425 		Tcl_AppendToObj(res, utf, cmtopen - utf);
426 		cmtclose = findHtmlCmtClose(Tcl_UtfNext(next3));
427 		if (cmtclose == NULL) {
428 		    Tcl_AppendToObj(res, cmtopen, -1);
429 		    LOG_MSG(interp, WRITE_LOG, __FILE__, __LINE__,
430 			    "removeHtmlComments", WEBLOG_INFO,
431 			    "end of string encountered while searching for comment-end",
432 			    NULL);
433 		    return TCL_OK;
434 		}
435 		else {
436 		    utf = Tcl_UtfNext(cmtclose);
437 		}
438 	    }
439 	    else {
440 
441 		if (next2[0] == '>') {
442 		    Tcl_AppendToObj(res, utf, cmtopen - utf);
443 		    utf = next3;
444 		}
445 		else {
446 
447 		    Tcl_AppendToObj(res, utf, cmtopen - utf + 1);
448 		    utf = next1;
449 		}
450 	    }
451 	}
452 	else {
453 	    /* ----------------------------------------------------------------------
454 	     * not a comment. proceed.
455 	     * ------------------------------------------------------------------- */
456 	    Tcl_AppendToObj(res, utf, cmtopen - utf + 1);
457 	    utf = next1;
458 	}
459     }
460     /* ------------------------------------------------------------------------
461      * append rest
462      * --------------------------------------------------------------------- */
463     if (utf != NULL)
464 	Tcl_AppendToObj(res, utf, -1);
465 
466     return TCL_OK;
467 }
468