1 /*
2  * $LynxId: LYCharSets.c,v 1.71 2021/06/29 22:01:12 tom Exp $
3  */
4 #include <HTUtils.h>
5 #include <HTCJK.h>
6 #include <HTMLDTD.h>
7 
8 #include <LYGlobalDefs.h>
9 #include <UCMap.h>
10 #include <UCdomap.h>
11 #include <UCDefs.h>
12 #include <LYCharSets.h>
13 #include <GridText.h>
14 #include <LYCurses.h>
15 #include <LYStrings.h>
16 
17 #include <LYLeaks.h>
18 
/*
 * Global character-set state defined by this file.
 */
HTkcode kanji_code = NOKANJI;	/* assigned together with HTCJK in HTMLSetCharacterHandling() */
BOOLEAN LYHaveCJKCharacterSet = FALSE;	/* maintained by HTMLSetHaveCJKCharacterSet() */
BOOLEAN DisplayCharsetMatchLocale = TRUE;	/* maintained by HTMLSetDisplayCharsetMatchLocale() */
/*
 * NOTE(review): the next two variables are not referenced in this part of
 * the file; presumably they belong together (a flag plus the charset handle
 * to force) -- confirm against the callers.
 */
BOOL force_old_UCLYhndl_on_reload = FALSE;
int forced_UCLYhdnl;
int LYNumCharsets = 0;		/* Will be initialized later by UC_Register. */
int current_char_set = -1;	/* will be initialized later in LYMain.c */
int linedrawing_char_set = -1;
STRING2PTR p_entity_values = NULL;	/* Pointer, for HTML_put_entity() */

			      /* obsolete and probably not used(???)        */
			      /* will be initialized in HTMLUseCharacterSet */
/*
 * Everything below is compiled only when USE_CHARSET_CHOICE is defined.
 * NOTE(review): judging by the macro and variable names this is the data for
 * restricting the charsets offered in the Options menu -- confirm.
 */
#ifdef USE_CHARSET_CHOICE
charset_subset_t charset_subsets[MAXCHARSETS];
BOOL custom_display_charset = FALSE;
BOOL custom_assumed_doc_charset = FALSE;

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
int display_charset_map[MAXCHARSETS];
int assumed_doc_charset_map[MAXCHARSETS];

/*
 * One slot more than MAXCHARSETS; presumably room for a terminating NULL,
 * as in LYchar_set_names[] -- confirm.
 */
const char *display_charset_choices[MAXCHARSETS + 1];
const char *assumed_charset_choices[MAXCHARSETS + 1];
int displayed_display_charset_idx;
#endif
#endif /* USE_CHARSET_CHOICE */
45 
46 /*
47  * New character sets now declared with UCInit() in UCdomap.c
48  *
49  * INSTRUCTIONS for adding new character sets which do not have
50  *		Unicode tables now in UCdomap.h
51  *
52  *
 * [With luck you no longer need to correct or extend the old-style mappings
 * below, such as ISO_Latin1[] or SevenBitApproximations[]:  translation now
 * goes through the new chartrans mechanism, and these tables are kept only
 * for compatibility.  They ought to be cleaned up, but that is not easy...]
57  *
58  * Currently we only declare some charset's properties here (such as MIME
59  * names, etc.), it does not include real mapping.
60  *
 * There is a place marked "Add your new character sets HERE" in this file.
 * Make up a character set and add it in the same style as the ISO_Latin1 set
 * below, giving it a unique name.
64  *
65  * Add the name of the set to LYCharSets.  Similarly add the appropriate
66  * information to the tables below:  LYchar_set_names, LYCharSet_UC,
67  * LYlowest_eightbit.  These 4 tables all MUST have the same order.  (And this
68  * is the order you will see in Lynx Options Menu, which is why few
69  * unicode-based charsets are listed here).
70  *
71  */
72 
/*	Entity values -- for ISO Latin 1 local representation
 *
 *	This MUST match exactly the table referred to in the DTD!
 *
 *	One string per named entity; the entity name is given after the dash
 *	in each comment, and the entries appear in ASCII order of that name.
 *	Most values are the single ISO-8859-1 byte for the character, written
 *	as an octal escape.  A few entries are plain ASCII stand-ins ("-" for
 *	the dashes, "(TM)" for trade).  The entries marked NEVER CHANGE THIS
 *	hold the control codes \001, \002 and \007 -- presumably internal
 *	markers that are interpreted elsewhere in Lynx; confirm before
 *	touching them.
 */
static const char *ISO_Latin1[] =
{
    "\306",			/* capital AE diphthong (ligature) (&#198;) - AElig */
    "\301",			/* capital A, acute accent (&#193;) - Aacute */
    "\302",			/* capital A, circumflex accent (&#194;) - Acirc */
    "\300",			/* capital A, grave accent (&#192;) - Agrave */
    "\305",			/* capital A, ring (&#197;) - Aring */
    "\303",			/* capital A, tilde (&#195;) - Atilde */
    "\304",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
    "\307",			/* capital C, cedilla (&#199;) - Ccedil */
    "\320",			/* capital Eth or D with stroke (&#208;) - Dstrok */
    "\320",			/* capital Eth, Icelandic (&#208;) - ETH */
    "\311",			/* capital E, acute accent (&#201;) - Eacute */
    "\312",			/* capital E, circumflex accent (&#202;) - Ecirc */
    "\310",			/* capital E, grave accent (&#200;) - Egrave */
    "\313",			/* capital E, dieresis or umlaut mark (&#203;) - Euml */
    "\315",			/* capital I, acute accent (&#205;) - Iacute */
    "\316",			/* capital I, circumflex accent (&#206;) - Icirc */
    "\314",			/* capital I, grave accent (&#204;) - Igrave */
    "\317",			/* capital I, dieresis or umlaut mark (&#207;) - Iuml */
    "\321",			/* capital N, tilde (&#209;) - Ntilde */
    "\323",			/* capital O, acute accent (&#211;) - Oacute */
    "\324",			/* capital O, circumflex accent (&#212;) - Ocirc */
    "\322",			/* capital O, grave accent (&#210;) - Ograve */
    "\330",			/* capital O, slash (&#216;) - Oslash */
    "\325",			/* capital O, tilde (&#213;) - Otilde */
    "\326",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
    "\336",			/* capital THORN, Icelandic (&#222;) - THORN */
    "\332",			/* capital U, acute accent (&#218;) - Uacute */
    "\333",			/* capital U, circumflex accent (&#219;) - Ucirc */
    "\331",			/* capital U, grave accent (&#217;) - Ugrave */
    "\334",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
    "\335",			/* capital Y, acute accent (&#221;) - Yacute */
    "\341",			/* small a, acute accent (&#225;) - aacute */
    "\342",			/* small a, circumflex accent (&#226;) - acirc */
    "\264",			/* spacing acute (&#180;) - acute */
    "\346",			/* small ae diphthong (ligature) (&#230;) - aelig */
    "\340",			/* small a, grave accent (&#224;) - agrave */
    "\046",			/* ampersand (&#38;) - amp */
    "\345",			/* small a, ring (&#229;) - aring */
    "\343",			/* small a, tilde (&#227;) - atilde */
    "\344",			/* small a, dieresis or umlaut mark (&#228;) - auml */
    "\246",			/* broken vertical bar (&#166;) - brkbar */
    "\246",			/* broken vertical bar (&#166;) - brvbar */
    "\347",			/* small c, cedilla (&#231;) - ccedil */
    "\270",			/* spacing cedilla (&#184;) - cedil */
    "\242",			/* cent sign (&#162;) - cent */
    "\251",			/* copyright sign (&#169;) - copy */
    "\244",			/* currency sign (&#164;) - curren */
    "\260",			/* degree sign (&#176;) - deg */
    "\250",			/* spacing dieresis (&#168;) - die */
    "\367",			/* division sign (&#247;) - divide */
    "\351",			/* small e, acute accent (&#233;) - eacute */
    "\352",			/* small e, circumflex accent (&#234;) - ecirc */
    "\350",			/* small e, grave accent (&#232;) - egrave */
    "-",			/* dash the width of emsp - emdash */
    "\002",			/* emsp, em space - not collapsed NEVER CHANGE THIS - emsp */
    "-",			/* dash the width of ensp - endash */
    "\002",			/* ensp, en space - not collapsed NEVER CHANGE THIS - ensp */
    "\360",			/* small eth, Icelandic (&#240;) - eth */
    "\353",			/* small e, dieresis or umlaut mark (&#235;) - euml */
    "\275",			/* fraction 1/2 (&#189;) - frac12 */
    "\274",			/* fraction 1/4 (&#188;) - frac14 */
    "\276",			/* fraction 3/4 (&#190;) - frac34 */
    "\076",			/* greater than (&#62;) - gt */
    "\257",			/* spacing macron (&#175;) - hibar */
    "\355",			/* small i, acute accent (&#237;) - iacute */
    "\356",			/* small i, circumflex accent (&#238;) - icirc */
    "\241",			/* inverted exclamation mark (&#161;) - iexcl */
    "\354",			/* small i, grave accent (&#236;) - igrave */
    "\277",			/* inverted question mark (&#191;) - iquest */
    "\357",			/* small i, dieresis or umlaut mark (&#239;) - iuml */
    "\253",			/* angle quotation mark, left (&#171;) - laquo */
    "\074",			/* less than (&#60;) - lt */
    "\257",			/* spacing macron (&#175;) - macr */
    "-",			/* dash the width of emsp - mdash */
    "\265",			/* micro sign (&#181;) - micro */
    "\267",			/* middle dot (&#183;) - middot */
    "\001",			/* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",			/* dash the width of ensp - ndash */
    "\254",			/* negation sign (&#172;) - not */
    "\361",			/* small n, tilde (&#241;) - ntilde */
    "\363",			/* small o, acute accent (&#243;) - oacute */
    "\364",			/* small o, circumflex accent (&#244;) - ocirc */
    "\362",			/* small o, grave accent (&#242;) - ograve */
    "\252",			/* feminine ordinal indicator (&#170;) - ordf */
    "\272",			/* masculine ordinal indicator (&#186;) - ordm */
    "\370",			/* small o, slash (&#248;) - oslash */
    "\365",			/* small o, tilde (&#245;) - otilde */
    "\366",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
    "\266",			/* paragraph sign (&#182;) - para */
    "\261",			/* plus-or-minus sign (&#177;) - plusmn */
    "\243",			/* pound sign (&#163;) - pound */
    "\042",			/* quote '"' (&#34;) - quot */
    "\273",			/* angle quotation mark, right (&#187;) - raquo */
    "\256",			/* circled R registered sign (&#174;) - reg */
    "\247",			/* section sign (&#167;) - sect */
    "\007",			/* soft hyphen (&#173;) NEVER CHANGE THIS - shy */
    "\271",			/* superscript 1 (&#185;) - sup1 */
    "\262",			/* superscript 2 (&#178;) - sup2 */
    "\263",			/* superscript 3 (&#179;) - sup3 */
    "\337",			/* small sharp s, German (sz ligature) (&#223;) - szlig */
    "\002",			/* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "\376",			/* small thorn, Icelandic (&#254;) - thorn */
    "\327",			/* multiplication sign (&#215;) - times */
    "(TM)",			/* circled TM trade mark sign (&#8482;) - trade */
    "\372",			/* small u, acute accent (&#250;) - uacute */
    "\373",			/* small u, circumflex accent (&#251;) - ucirc */
    "\371",			/* small u, grave accent (&#249;) - ugrave */
    "\250",			/* spacing dieresis (&#168;) - uml */
    "\374",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
    "\375",			/* small y, acute accent (&#253;) - yacute */
    "\245",			/* yen sign (&#165;) - yen */
    "\377",			/* small y, dieresis or umlaut mark (&#255;) - yuml */
};
192 
/*	Entity values -- 7 bit character approximations
 *
 *	This MUST match exactly the table referred to in the DTD!
 *
 *	Same entity order as ISO_Latin1[] (entity name after the dash in each
 *	comment), but every value is an ASCII-only replacement string.  When
 *	LY_UMLAUT is defined, the umlaut vowels are written with a trailing
 *	'e' ("Ae", "oe", ...) instead of the bare base letter.  The entries
 *	marked NEVER CHANGE THIS carry the same control codes as the
 *	corresponding ISO_Latin1[] entries.
 */
const char *SevenBitApproximations[] =
{
    "AE",			/* capital AE diphthong (ligature) (&#198;) - AElig */
    "A",			/* capital A, acute accent (&#193;) - Aacute */
    "A",			/* capital A, circumflex accent (&#194;) - Acirc */
    "A",			/* capital A, grave accent (&#192;) - Agrave */
    "A",			/* capital A, ring (&#197;) - Aring */
    "A",			/* capital A, tilde (&#195;) - Atilde */
#ifdef LY_UMLAUT
    "Ae",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
#else
    "A",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
#endif				/* LY_UMLAUT */
    "C",			/* capital C, cedilla (&#199;) - Ccedil */
    "Dj",			/* capital D with stroke (&#208;) - Dstrok */
    "DH",			/* capital Eth, Icelandic (&#208;) - ETH */
    "E",			/* capital E, acute accent (&#201;) - Eacute */
    "E",			/* capital E, circumflex accent (&#202;) - Ecirc */
    "E",			/* capital E, grave accent (&#200;) - Egrave */
    "E",			/* capital E, dieresis or umlaut mark (&#203;) - Euml */
    "I",			/* capital I, acute accent (&#205;) - Iacute */
    "I",			/* capital I, circumflex accent (&#206;) - Icirc */
    "I",			/* capital I, grave accent (&#204;) - Igrave */
    "I",			/* capital I, dieresis or umlaut mark (&#207;) - Iuml */
    "N",			/* capital N, tilde (&#209;) - Ntilde */
    "O",			/* capital O, acute accent (&#211;) - Oacute */
    "O",			/* capital O, circumflex accent (&#212;) - Ocirc */
    "O",			/* capital O, grave accent (&#210;) - Ograve */
    "O",			/* capital O, slash (&#216;) - Oslash */
    "O",			/* capital O, tilde (&#213;) - Otilde */
#ifdef LY_UMLAUT
    "Oe",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
#else
    "O",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
#endif				/* LY_UMLAUT */
    "P",			/* capital THORN, Icelandic (&#222;) - THORN */
    "U",			/* capital U, acute accent (&#218;) - Uacute */
    "U",			/* capital U, circumflex accent (&#219;) - Ucirc */
    "U",			/* capital U, grave accent (&#217;) - Ugrave */
#ifdef LY_UMLAUT
    "Ue",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
#else
    "U",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
#endif				/* LY_UMLAUT */
    "Y",			/* capital Y, acute accent (&#221;) - Yacute */
    "a",			/* small a, acute accent (&#225;) - aacute */
    "a",			/* small a, circumflex accent (&#226;) - acirc */
    "'",			/* spacing acute (&#180;) - acute */
    "ae",			/* small ae diphthong (ligature) (&#230;) - aelig */
    "`a",			/* small a, grave accent (&#224;) - agrave */
    "&",			/* ampersand (&#38;) - amp */
    "a",			/* small a, ring (&#229;) - aring */
    "a",			/* small a, tilde (&#227;) - atilde */
#ifdef LY_UMLAUT
    "ae",			/* small a, dieresis or umlaut mark (&#228;) - auml */
#else
    "a",			/* small a, dieresis or umlaut mark (&#228;) - auml */
#endif				/* LY_UMLAUT */
    "|",			/* broken vertical bar (&#166;) - brkbar */
    "|",			/* broken vertical bar (&#166;) - brvbar */
    "c",			/* small c, cedilla (&#231;) - ccedil */
    ",",			/* spacing cedilla (&#184;) - cedil */
    "-c-",			/* cent sign (&#162;) - cent */
    "(c)",			/* copyright sign (&#169;) - copy */
    "CUR",			/* currency sign (&#164;) - curren */
    "DEG",			/* degree sign (&#176;) - deg */
    "\042",			/* spacing dieresis (&#168;) - die */
    "/",			/* division sign (&#247;) - divide */
    "e",			/* small e, acute accent (&#233;) - eacute */
    "e",			/* small e, circumflex accent (&#234;) - ecirc */
    "e",			/* small e, grave accent (&#232;) - egrave */
    "-",			/* dash the width of emsp - emdash */
    "\002",			/* emsp NEVER CHANGE THIS - emsp */
    "-",			/* dash the width of ensp - endash */
    "\002",			/* ensp NEVER CHANGE THIS - ensp */
    "dh",			/* small eth, Icelandic (&#240;) - eth */
    "e",			/* small e, dieresis or umlaut mark (&#235;) - euml */
    " 1/2",			/* fraction 1/2 (&#189;) - frac12 */
    " 1/4",			/* fraction 1/4 (&#188;) - frac14 */
    " 3/4",			/* fraction 3/4 (&#190;) - frac34 */
    ">",			/* greater than (&#62;) - gt */
    "-",			/* spacing macron (&#175;) - hibar */
    "i",			/* small i, acute accent (&#237;) - iacute */
    "i",			/* small i, circumflex accent (&#238;) - icirc */
    "!",			/* inverted exclamation mark (&#161;) - iexcl */
    "`i",			/* small i, grave accent (&#236;) - igrave */
    "?",			/* inverted question mark (&#191;) - iquest */
    "i",			/* small i, dieresis or umlaut mark (&#239;) - iuml */
    "<<",			/* angle quotation mark, left (&#171;) - laquo */
    "<",			/* less than (&#60;) - lt */
    "-",			/* spacing macron (&#175;) - macr */
    "-",			/* dash the width of emsp - mdash */
    "u",			/* micro sign (&#181;) - micro */
    ".",			/* middle dot (&#183;) - middot */
    "\001",			/* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",			/* dash the width of ensp - ndash */
    "NOT",			/* negation sign (&#172;) - not */
    "n",			/* small n, tilde (&#241;) - ntilde */
    "o",			/* small o, acute accent (&#243;) - oacute */
    "o",			/* small o, circumflex accent (&#244;) - ocirc */
    "o",			/* small o, grave accent (&#242;) - ograve */
    "-a",			/* feminine ordinal indicator (&#170;) - ordf */
    "-o",			/* masculine ordinal indicator (&#186;) - ordm */
    "o",			/* small o, slash (&#248;) - oslash */
    "o",			/* small o, tilde (&#245;) - otilde */
#ifdef LY_UMLAUT
    "oe",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
#else
    "o",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
#endif				/* LY_UMLAUT */
    "P:",			/* paragraph sign (&#182;) - para */
    "+-",			/* plus-or-minus sign (&#177;) - plusmn */
    "-L-",			/* pound sign (&#163;) - pound */
    "\"",			/* quote '"' (&#34;) - quot */
    ">>",			/* angle quotation mark, right (&#187;) - raquo */
    "(R)",			/* circled R registered sign (&#174;) - reg */
    "S:",			/* section sign (&#167;) - sect */
    "\007",			/* soft hyphen (&#173;) NEVER CHANGE THIS - shy */
    "^1",			/* superscript 1 (&#185;) - sup1 */
    "^2",			/* superscript 2 (&#178;) - sup2 */
    "^3",			/* superscript 3 (&#179;) - sup3 */
    "ss",			/* small sharp s, German (sz ligature) (&#223;) - szlig */
    "\002",			/* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "p",			/* small thorn, Icelandic (&#254;) - thorn */
    "*",			/* multiplication sign (&#215;) - times */
    "(TM)",			/* circled TM trade mark sign (&#8482;) - trade */
    "u",			/* small u, acute accent (&#250;) - uacute */
    "u",			/* small u, circumflex accent (&#251;) - ucirc */
    "u",			/* small u, grave accent (&#249;) - ugrave */
    "\042",			/* spacing dieresis (&#168;) - uml */
#ifdef LY_UMLAUT
    "ue",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
#else
    "u",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
#endif				/* LY_UMLAUT */
    "y",			/* small y, acute accent (&#253;) - yacute */
    "YEN",			/* yen sign (&#165;) - yen */
    "y",			/* small y, dieresis or umlaut mark (&#255;) - yuml */
};
336 
337 /*
338  * Add your new character sets HERE (but only if you can't construct Unicode
339  * tables for them).  - FM
340  */
341 
/*
 * Add the array name to LYCharSets.
 *
 * One entity-value table per character set.  Only the first two slots are
 * given here; the remaining slots of the MAXCHARSETS-sized array start out
 * as null pointers (static initialization).
 */
STRING2PTR LYCharSets[MAXCHARSETS] =
{
    ISO_Latin1,			/* ISO Latin 1          */
    SevenBitApproximations,	/* 7 Bit Approximations */
};
350 
/*
 * Add the name that the user will see below.  The order of LYCharSets and
 * LYchar_set_names MUST be the same.
 *
 * The list is terminated by a null pointer, which is why the array has one
 * slot more than MAXCHARSETS.
 */
const char *LYchar_set_names[MAXCHARSETS + 1] =
{
    "Western (ISO-8859-1)",
    "7 bit approximations (US-ASCII)",
    (char *) 0
};
361 
/*
 * Associate additional pieces of info with each of the charsets listed above.
 * This table will be modified (and extended) automatically by the charset
 * translations that are loaded using the chartrans mechanism.  The most
 * important piece of info to put here is the MIME charset name, which is used
 * for chartrans (see UCDefs.h).  The order of LYCharSets and LYCharSet_UC
 * MUST be the same.
 *
 * Note that most of the charsets added by the new mechanism in src/chrtrans
 * don't show up here at all.  They don't have to.
 */
LYUCcharset LYCharSet_UC[MAXCHARSETS] =
{
  /*
   * Zero position placeholder and HTMLGetEntityUCValue() reference.  - FM
   */
    {-1, "iso-8859-1", UCT_ENC_8BIT, 0,
     UCT_REP_IS_LAT1,
     UCT_CP_IS_LAT1, UCT_R_LAT1, UCT_R_LAT1},

  /*
   * Placeholders for Unicode tables.  - FM
   */
    {-1, "us-ascii", UCT_ENC_7BIT, 0,
     UCT_REP_SUBSETOF_LAT1,
     UCT_CP_SUBSETOF_LAT1, UCT_R_ASCII, UCT_R_ASCII},

};
389 
/*
 * Add the code of the lowest character with the high bit set that can be
 * directly displayed.  The order of LYCharSets and LYlowest_eightbit MUST be
 * the same.
 *
 * (If a charset has a chartrans unicode table, LYlowest_eightbit will be
 * verified/modified anyway.)
 *
 * HTMLSetCharacterHandling() compares these values against 130, 155, 160 and
 * 191.  The value 999 lies above every 8-bit code, so the "<= 160" and
 * "<= 130" tests there are always false for the 7 bit entry.
 */
int LYlowest_eightbit[MAXCHARSETS] =
{
    160,			/* ISO Latin 1          */
    999,			/* 7 bit approximations */
};
403 
404 /*
405  * Function to set the handling of selected character sets based on the current
406  * LYUseDefaultRawMode value.  - FM
407  */
HTMLSetCharacterHandling(int i)408 void HTMLSetCharacterHandling(int i)
409 {
410     int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset);
411     BOOLEAN LYRawMode_flag = LYRawMode;
412     int UCLYhndl_for_unspec_flag = UCLYhndl_for_unspec;
413 
414     if (LYCharSet_UC[i].enc != UCT_ENC_CJK) {
415 	HTCJK = NOCJK;
416 	kanji_code = NOKANJI;
417 	if (i == chndl)
418 	    LYRawMode = LYUseDefaultRawMode;
419 	else
420 	    LYRawMode = (BOOL) (!LYUseDefaultRawMode);
421 
422 	HTPassEightBitNum = (BOOL) ((LYCharSet_UC[i].codepoints & UCT_CP_SUPERSETOF_LAT1)
423 				    || (LYCharSet_UC[i].like8859 & UCT_R_HIGH8BIT));
424 
425 	if (LYRawMode) {
426 	    HTPassEightBitRaw = (BOOL) (LYlowest_eightbit[i] <= 160);
427 	} else {
428 	    HTPassEightBitRaw = FALSE;
429 	}
430 	if (LYRawMode || i == chndl) {
431 	    HTPassHighCtrlRaw = (BOOL) (LYlowest_eightbit[i] <= 130);
432 	} else {
433 	    HTPassHighCtrlRaw = FALSE;
434 	}
435 
436 	HTPassHighCtrlNum = FALSE;
437 
438     } else {			/* CJK encoding: */
439 	const char *mime = LYCharSet_UC[i].MIMEname;
440 
441 	if (!strcmp(mime, "euc-cn")) {
442 	    HTCJK = CHINESE;
443 	    kanji_code = EUC;
444 	} else if (!strcmp(mime, "euc-jp")) {
445 	    HTCJK = JAPANESE;
446 	    kanji_code = EUC;
447 	} else if (!strcmp(mime, "shift_jis")) {
448 	    HTCJK = JAPANESE;
449 	    kanji_code = SJIS;
450 	} else if (!strcmp(mime, "euc-kr")) {
451 	    HTCJK = KOREAN;
452 	    kanji_code = EUC;
453 	} else if (!strcmp(mime, "big5")) {
454 	    HTCJK = TAIPEI;
455 	    kanji_code = EUC;
456 	}
457 
458 	/* for any CJK: */
459 	if (!LYUseDefaultRawMode)
460 	    HTCJK = NOCJK;
461 	LYRawMode = (BOOL) (IS_CJK_TTY ? TRUE : FALSE);
462 	HTPassEightBitRaw = FALSE;
463 	HTPassEightBitNum = FALSE;
464 	HTPassHighCtrlRaw = (BOOL) (IS_CJK_TTY ? TRUE : FALSE);
465 	HTPassHighCtrlNum = FALSE;
466     }
467 
468     /*
469      * Comment for coding below:
470      * UCLYhndl_for_unspec is "current" state with LYRawMode, but
471      * UCAssume_MIMEcharset is independent from LYRawMode:  holds the history
472      * and may be changed from 'O'ptions menu only.  - LP
473      */
474     if (LYRawMode) {
475 	UCLYhndl_for_unspec = i;	/* UCAssume_MIMEcharset not changed! */
476     } else {
477 	if (chndl != i &&
478 	    (LYCharSet_UC[i].enc != UCT_ENC_CJK ||
479 	     LYCharSet_UC[chndl].enc != UCT_ENC_CJK)) {
480 	    UCLYhndl_for_unspec = chndl;	/* fall to UCAssume_MIMEcharset */
481 	} else {
482 	    UCLYhndl_for_unspec = LATIN1;	/* UCAssume_MIMEcharset not changed! */
483 	}
484     }
485 
486 #ifdef USE_SLANG
487     if (LYlowest_eightbit[i] > 191) {
488 	/*
489 	 * Higher than this may output cntrl chars to screen.  - KW
490 	 */
491 	SLsmg_Display_Eight_Bit = 191;
492     } else {
493 	SLsmg_Display_Eight_Bit = LYlowest_eightbit[i];
494     }
495 #endif /* USE_SLANG */
496 
497     ena_csi(LYlowest_eightbit[current_char_set] > 155);
498 
499     /* some diagnostics */
500     if (TRACE) {
501 	if (LYRawMode_flag != LYRawMode)
502 	    CTRACE((tfp,
503 		    "HTMLSetCharacterHandling: LYRawMode changed %s -> %s\n",
504 		    (LYRawMode_flag ? "ON" : "OFF"),
505 		    (LYRawMode ? "ON" : "OFF")));
506 	if (UCLYhndl_for_unspec_flag != UCLYhndl_for_unspec)
507 	    CTRACE((tfp,
508 		    "HTMLSetCharacterHandling: UCLYhndl_for_unspec changed %d -> %d\n",
509 		    UCLYhndl_for_unspec_flag,
510 		    UCLYhndl_for_unspec));
511     }
512 
513     return;
514 }
515 
516 /*
517  * Function to set HTCJK based on "in" and "out" charsets.
518  */
Set_HTCJK(const char * inMIMEname,const char * outMIMEname)519 void Set_HTCJK(const char *inMIMEname,
520 	       const char *outMIMEname)
521 {
522     /* need not check for synonyms: MIMEname's got from LYCharSet_UC */
523 
524     if (LYRawMode) {
525 	if ((!strcmp(inMIMEname, "euc-jp") ||
526 #ifdef USE_JAPANESEUTF8_SUPPORT
527 	     !strcmp(inMIMEname, "utf-8") ||
528 #endif
529 	     !strcmp(inMIMEname, "shift_jis")) &&
530 	    (!strcmp(outMIMEname, "euc-jp") ||
531 	     !strcmp(outMIMEname, "shift_jis"))) {
532 	    HTCJK = JAPANESE;
533 	} else if (!strcmp(inMIMEname, "euc-cn") &&
534 		   !strcmp(outMIMEname, "euc-cn")) {
535 	    HTCJK = CHINESE;
536 	} else if (!strcmp(inMIMEname, "big5") &&
537 		   !strcmp(outMIMEname, "big5")) {
538 	    HTCJK = TAIPEI;
539 	} else if (!strcmp(inMIMEname, "euc-kr") &&
540 		   !strcmp(outMIMEname, "euc-kr")) {
541 	    HTCJK = KOREAN;
542 	} else {
543 	    HTCJK = NOCJK;
544 	}
545     } else {
546 	HTCJK = NOCJK;
547     }
548 }
549 
550 /*
551  * Function to set the LYDefaultRawMode value based on the selected character
552  * set.  - FM
553  *
554  * Currently unused:  the default value so obvious that LYUseDefaultRawMode
555  * utilized directly by someone's mistake.  - LP
556  */
HTMLSetRawModeDefault(int i)557 static void HTMLSetRawModeDefault(int i)
558 {
559     LYDefaultRawMode = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK);
560     return;
561 }
562 
563 /*
564  * Function to set the LYUseDefaultRawMode value based on the selected
565  * character set and the current LYRawMode value.  - FM
566  */
HTMLSetUseDefaultRawMode(int i,int modeflag)567 void HTMLSetUseDefaultRawMode(int i,
568 			      int modeflag)
569 {
570     if (LYCharSet_UC[i].enc != UCT_ENC_CJK) {
571 
572 	int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset);
573 
574 	if (i == chndl)
575 	    LYUseDefaultRawMode = (BOOLEAN) modeflag;
576 	else
577 	    LYUseDefaultRawMode = (BOOL) (!modeflag);
578     } else			/* CJK encoding: */
579 	LYUseDefaultRawMode = (BOOLEAN) modeflag;
580 
581     return;
582 }
583 
584 /*
585  * Function to set the LYHaveCJKCharacterSet value based on the selected
586  * character set.  - FM
587  */
HTMLSetHaveCJKCharacterSet(int i)588 static void HTMLSetHaveCJKCharacterSet(int i)
589 {
590     LYHaveCJKCharacterSet = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK);
591     return;
592 }
593 
594 /*
595  * Function to set the DisplayCharsetMatchLocale value based on the selected
596  * character set.  It is used in UPPER8 for 8bit case-insensitive search by
597  * matching def7_uni.tbl images.  - LP
598  */
HTMLSetDisplayCharsetMatchLocale(int i)599 static void HTMLSetDisplayCharsetMatchLocale(int i)
600 {
601     BOOLEAN match;
602 
603     if (LYHaveCJKCharacterSet) {
604 	/*
605 	 * We have no intention to pass CJK via UCTransChar if that happened.
606 	 * Let someone from CJK correct this if necessary.
607 	 */
608 	DisplayCharsetMatchLocale = TRUE;	/* old-style */
609 	return;
610 
611     } else if (strncasecomp(LYCharSet_UC[i].MIMEname, "cp", 2) ||
612 	       strncasecomp(LYCharSet_UC[i].MIMEname, "windows", 7)) {
613 	/*
614 	 * Assume dos/windows displays usually on remote terminal, hence it
615 	 * rarely matches locale.  (In fact, MS Windows codepoints locale are
616 	 * never seen on UNIX).
617 	 */
618 	match = FALSE;
619     } else {
620 	match = TRUE;		/* guess, but see below */
621 
622 #if !defined(LOCALE)
623 	if (LYCharSet_UC[i].enc != UCT_ENC_UTF8)
624 	    /*
625 	     * Leave true for utf-8 display - the code doesn't deal very well
626 	     * with this case.  - kw
627 	     */
628 	    match = FALSE;
629 #else
630 	if (UCForce8bitTOUPPER) {
631 	    /*
632 	     * Force disable locale (from lynx.cfg)
633 	     */
634 	    match = FALSE;
635 	}
636 #endif
637     }
638 
639     DisplayCharsetMatchLocale = match;
640     return;
641 }
642 
/*
 * lynx 2.8/2.7.2 (and earlier) compatibility code:  the "human-readable"
 * charset names change over time, so the historical names are mapped to MIME
 * names here to keep old lynx.cfg and (especially) .lynxrc files recognized.
 * Please update this table whenever you change the "fullname" of any present
 * charset.
 */
typedef struct _names_pairs {
    const char *fullname;	/* historical display name */
    const char *MIMEname;	/* MIME charset name it maps to */
} names_pairs;
/* *INDENT-OFF* */
/*
 * Several historical names may map to the same MIME name.  The table is
 * searched linearly by UCGetLYhndl_byAnyName() and ends with a NULL pair.
 */
static const names_pairs OLD_charset_names[] =
{
    {"ISO Latin 1",		"iso-8859-1"},
    {"ISO Latin 2",             "iso-8859-2"},
    {"WinLatin1 (cp1252)",      "windows-1252"},
    {"DEC Multinational",       "dec-mcs"},
    {"Macintosh (8 bit)",       "macintosh"},
    {"NeXT character set",      "next"},
    {"KOI8-R Cyrillic",         "koi8-r"},
    {"Chinese",                 "euc-cn"},
    {"Japanese (EUC)",          "euc-jp"},
    {"Japanese (SJIS)",         "shift_jis"},
    {"Korean",                  "euc-kr"},
    {"Taipei (Big5)",           "big5"},
    {"Vietnamese (VISCII)",     "viscii"},
    {"7 bit approximations",    "us-ascii"},
    {"Transparent",             "x-transparent"},
    {"DosLatinUS (cp437)",      "cp437"},
    {"IBM PC character set",    "cp437"},
    {"DosLatin1 (cp850)",       "cp850"},
    {"IBM PC codepage 850",     "cp850"},
    {"DosLatin2 (cp852)",       "cp852"},
    {"PC Latin2 CP 852",        "cp852"},
    {"DosCyrillic (cp866)",     "cp866"},
    {"DosArabic (cp864)",       "cp864"},
    {"DosGreek (cp737)",        "cp737"},
    {"DosBaltRim (cp775)",      "cp775"},
    {"DosGreek2 (cp869)",       "cp869"},
    {"DosHebrew (cp862)",       "cp862"},
    {"WinLatin2 (cp1250)",      "windows-1250"},
    {"WinCyrillic (cp1251)",    "windows-1251"},
    {"WinGreek (cp1253)",       "windows-1253"},
    {"WinHebrew (cp1255)",      "windows-1255"},
    {"WinArabic (cp1256)",      "windows-1256"},
    {"WinBaltRim (cp1257)",     "windows-1257"},
    {"ISO Latin 3",             "iso-8859-3"},
    {"ISO Latin 4",             "iso-8859-4"},
    {"ISO 8859-5 Cyrillic",     "iso-8859-5"},
    {"ISO 8859-6 Arabic",       "iso-8859-6"},
    {"ISO 8859-7 Greek",        "iso-8859-7"},
    {"ISO 8859-8 Hebrew",       "iso-8859-8"},
    {"ISO-8859-8-I",            "iso-8859-8"},
    {"ISO-8859-8-E",            "iso-8859-8"},
    {"ISO 8859-9 (Latin 5)",    "iso-8859-9"},
    {"ISO 8859-10",             "iso-8859-10"},
    {"UNICODE UTF 8",           "utf-8"},
    {"RFC 1345 w/o Intro",      "mnemonic+ascii+0"},
    {"RFC 1345 Mnemonic",       "mnemonic"},
    {NULL, NULL},		/* terminated with NULL */
};
/* *INDENT-ON* */
705 
706 /*
707  * lynx 2.8/2.7.2 compatibility code:  read "character_set" parameter from
708  * lynx.cfg and .lynxrc in both MIME name and "human-readable" name (old and
709  * new style).  Returns -1 if not recognized.
710  */
UCGetLYhndl_byAnyName(char * value)711 int UCGetLYhndl_byAnyName(char *value)
712 {
713     int i;
714 
715     if (value == NULL)
716 	return -1;
717 
718     LYTrimTrailing(value);
719     CTRACE((tfp, "UCGetLYhndl_byAnyName(%s)\n", value));
720 
721     /* search by name */
722     for (i = 0; (i < MAXCHARSETS && LYchar_set_names[i]); i++) {
723 	if (!strcmp(value, LYchar_set_names[i])) {
724 	    return i;		/* OK */
725 	}
726     }
727 
728     /* search by old name from 2.8/2.7.2 version */
729     for (i = 0; (OLD_charset_names[i].fullname); i++) {
730 	if (!strcmp(value, OLD_charset_names[i].fullname)) {
731 	    return UCGetLYhndl_byMIME(OLD_charset_names[i].MIMEname);	/* OK */
732 	}
733     }
734 
735     return UCGetLYhndl_byMIME(value);	/* by MIME */
736 }
737 
/*
 * Entity names -- Ordered by ISO Latin 1 value.
 * ---------------------------------------------
 * For conversions of DECIMAL escaped entities.
 * Must be in order of ascending value, without gaps:  element n holds the
 * name for decimal value 160 + n (the value is noted beside each entry).
 *
 * This is a fixed lookup table, so both the strings and the pointer slots
 * are declared const; accidental writes are rejected at compile time.
 */
static const char *const LYEntityNames[] =
{
/*	 NAME		   DECIMAL VALUE */
    "nbsp",			/* 160, non breaking space */
    "iexcl",			/* 161, inverted exclamation mark */
    "cent",			/* 162, cent sign */
    "pound",			/* 163, pound sign */
    "curren",			/* 164, currency sign */
    "yen",			/* 165, yen sign */
    "brvbar",			/* 166, broken vertical bar, (brkbar) */
    "sect",			/* 167, section sign */
    "uml",			/* 168, spacing dieresis */
    "copy",			/* 169, copyright sign */
    "ordf",			/* 170, feminine ordinal indicator */
    "laquo",			/* 171, angle quotation mark, left */
    "not",			/* 172, negation sign */
    "shy",			/* 173, soft hyphen */
    "reg",			/* 174, circled R registered sign */
    "hibar",			/* 175, spacing macron */
    "deg",			/* 176, degree sign */
    "plusmn",			/* 177, plus-or-minus sign */
    "sup2",			/* 178, superscript 2 */
    "sup3",			/* 179, superscript 3 */
    "acute",			/* 180, spacing acute (96) */
    "micro",			/* 181, micro sign */
    "para",			/* 182, paragraph sign */
    "middot",			/* 183, middle dot */
    "cedil",			/* 184, spacing cedilla */
    "sup1",			/* 185, superscript 1 */
    "ordm",			/* 186, masculine ordinal indicator */
    "raquo",			/* 187, angle quotation mark, right */
    "frac14",			/* 188, fraction 1/4 */
    "frac12",			/* 189, fraction 1/2 */
    "frac34",			/* 190, fraction 3/4 */
    "iquest",			/* 191, inverted question mark */
    "Agrave",			/* 192, capital A, grave accent */
    "Aacute",			/* 193, capital A, acute accent */
    "Acirc",			/* 194, capital A, circumflex accent */
    "Atilde",			/* 195, capital A, tilde */
    "Auml",			/* 196, capital A, dieresis or umlaut mark */
    "Aring",			/* 197, capital A, ring */
    "AElig",			/* 198, capital AE diphthong (ligature) */
    "Ccedil",			/* 199, capital C, cedilla */
    "Egrave",			/* 200, capital E, grave accent */
    "Eacute",			/* 201, capital E, acute accent */
    "Ecirc",			/* 202, capital E, circumflex accent */
    "Euml",			/* 203, capital E, dieresis or umlaut mark */
    "Igrave",			/* 204, capital I, grave accent */
    "Iacute",			/* 205, capital I, acute accent */
    "Icirc",			/* 206, capital I, circumflex accent */
    "Iuml",			/* 207, capital I, dieresis or umlaut mark */
    "ETH",			/* 208, capital Eth, Icelandic (or Latin2 Dstrok) */
    "Ntilde",			/* 209, capital N, tilde */
    "Ograve",			/* 210, capital O, grave accent */
    "Oacute",			/* 211, capital O, acute accent */
    "Ocirc",			/* 212, capital O, circumflex accent */
    "Otilde",			/* 213, capital O, tilde */
    "Ouml",			/* 214, capital O, dieresis or umlaut mark */
    "times",			/* 215, multiplication sign */
    "Oslash",			/* 216, capital O, slash */
    "Ugrave",			/* 217, capital U, grave accent */
    "Uacute",			/* 218, capital U, acute accent */
    "Ucirc",			/* 219, capital U, circumflex accent */
    "Uuml",			/* 220, capital U, dieresis or umlaut mark */
    "Yacute",			/* 221, capital Y, acute accent */
    "THORN",			/* 222, capital THORN, Icelandic */
    "szlig",			/* 223, small sharp s, German (sz ligature) */
    "agrave",			/* 224, small a, grave accent */
    "aacute",			/* 225, small a, acute accent */
    "acirc",			/* 226, small a, circumflex accent */
    "atilde",			/* 227, small a, tilde */
    "auml",			/* 228, small a, dieresis or umlaut mark */
    "aring",			/* 229, small a, ring */
    "aelig",			/* 230, small ae diphthong (ligature) */
    "ccedil",			/* 231, small c, cedilla */
    "egrave",			/* 232, small e, grave accent */
    "eacute",			/* 233, small e, acute accent */
    "ecirc",			/* 234, small e, circumflex accent */
    "euml",			/* 235, small e, dieresis or umlaut mark */
    "igrave",			/* 236, small i, grave accent */
    "iacute",			/* 237, small i, acute accent */
    "icirc",			/* 238, small i, circumflex accent */
    "iuml",			/* 239, small i, dieresis or umlaut mark */
    "eth",			/* 240, small eth, Icelandic */
    "ntilde",			/* 241, small n, tilde */
    "ograve",			/* 242, small o, grave accent */
    "oacute",			/* 243, small o, acute accent */
    "ocirc",			/* 244, small o, circumflex accent */
    "otilde",			/* 245, small o, tilde */
    "ouml",			/* 246, small o, dieresis or umlaut mark */
    "divide",			/* 247, division sign */
    "oslash",			/* 248, small o, slash */
    "ugrave",			/* 249, small u, grave accent */
    "uacute",			/* 250, small u, acute accent */
    "ucirc",			/* 251, small u, circumflex accent */
    "uuml",			/* 252, small u, dieresis or umlaut mark */
    "yacute",			/* 253, small y, acute accent */
    "thorn",			/* 254, small thorn, Icelandic */
    "yuml",			/* 255, small y, dieresis or umlaut mark */
};
844 
845 /*
846  * Function to return the entity names of ISO-8859-1 8-bit characters.  - FM
847  */
HTMLGetEntityName(UCode_t code)848 const char *HTMLGetEntityName(UCode_t code)
849 {
850 #define IntValue code
851     int MaxValue = (TABLESIZE(LYEntityNames) - 1);
852 
853     if (IntValue < 0 || IntValue > MaxValue) {
854 	return "";
855     }
856 
857     return LYEntityNames[IntValue];
858 }
859 
860 /*
861  * Function to return the UCode_t (long int) value for entity names.  It
862  * returns 0 if not found.
863  *
864  * unicode_entities[] handles all the names from old style entities[] too.
865  * Lynx now calls unicode_entities[] only through this function:
866  * HTMLGetEntityUCValue().  Note, we need not check for special characters here
867  * in function or even before it, we should check them *after* invoking this
868  * function, see put_special_unicodes() in SGML.c.
869  *
870  * In the future we will try to isolate all calls to entities[] in favor of new
871  * unicode-based chartrans scheme.  - LP
872  */
HTMLGetEntityUCValue(const char * name)873 UCode_t HTMLGetEntityUCValue(const char *name)
874 {
875 #include <entities.h>
876 
877     UCode_t value = 0;
878     size_t i, high, low;
879     int diff = 0;
880     size_t number_of_unicode_entities = TABLESIZE(unicode_entities);
881 
882     /*
883      * Make sure we have a non-zero length name.  - FM
884      */
885     if (isEmpty(name))
886 	return (value);
887 
888     /*
889      * Try UC_entity_info unicode_entities[].
890      */
891     for (low = 0, high = number_of_unicode_entities;
892 	 high > low;
893 	 diff < 0 ? (low = i + 1) : (high = i)) {
894 	/*
895 	 * Binary search.
896 	 */
897 	i = (low + (high - low) / 2);
898 	diff = AS_cmp(unicode_entities[i].name, name);	/* Case sensitive! */
899 	if (diff == 0) {
900 	    value = unicode_entities[i].code;
901 	    break;
902 	}
903     }
904     return (value);
905 }
906 
907 /*
908  * Original comment -
909  * Assume these are Microsoft code points, inflicted on us by FrontPage.  - FM
910  *
911  * MS FrontPage uses syntax like &#153; in 128-159 range and doesn't follow
912  * Unicode standards for this area.  Windows-1252 codepoints are assumed here.
913  *
914  * However see -
915  * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0
916  */
LYcp1252ToUnicode(UCode_t code)917 UCode_t LYcp1252ToUnicode(UCode_t code)
918 {
919     if ((code == 1) ||
920 	(code > 127 && code < 160)) {
921 	switch (code) {
922 	case 1:
923 	    /*
924 	     * WHITE SMILING FACE
925 	     */
926 	    code = 0x263a;
927 	    break;
928 	case 128:
929 	    /*
930 	     * EURO currency sign
931 	     */
932 	    code = 0x20ac;
933 	    break;
934 	case 130:
935 	    /*
936 	     * SINGLE LOW-9 QUOTATION MARK (sbquo)
937 	     */
938 	    code = 0x201a;
939 	    break;
940 	case 131:
941 	    /*
942 	     * LATIN SMALL LETTER F WITH HOOK
943 	     */
944 	    code = 0x192;
945 	    break;
946 	case 132:
947 	    /*
948 	     * DOUBLE LOW-9 QUOTATION MARK (bdquo)
949 	     */
950 	    code = 0x201e;
951 	    break;
952 	case 133:
953 	    /*
954 	     * HORIZONTAL ELLIPSIS (hellip)
955 	     */
956 	    code = 0x2026;
957 	    break;
958 	case 134:
959 	    /*
960 	     * DAGGER (dagger)
961 	     */
962 	    code = 0x2020;
963 	    break;
964 	case 135:
965 	    /*
966 	     * DOUBLE DAGGER (Dagger)
967 	     */
968 	    code = 0x2021;
969 	    break;
970 	case 136:
971 	    /*
972 	     * MODIFIER LETTER CIRCUMFLEX ACCENT
973 	     */
974 	    code = 0x2c6;
975 	    break;
976 	case 137:
977 	    /*
978 	     * PER MILLE SIGN (permil)
979 	     */
980 	    code = 0x2030;
981 	    break;
982 	case 138:
983 	    /*
984 	     * LATIN CAPITAL LETTER S WITH CARON
985 	     */
986 	    code = 0x160;
987 	    break;
988 	case 139:
989 	    /*
990 	     * SINGLE LEFT-POINTING ANGLE QUOTATION MARK (lsaquo)
991 	     */
992 	    code = 0x2039;
993 	    break;
994 	case 140:
995 	    /*
996 	     * LATIN CAPITAL LIGATURE OE
997 	     */
998 	    code = 0x152;
999 	    break;
1000 	case 142:
1001 	    /*
1002 	     * LATIN CAPITAL LETTER Z WITH CARON
1003 	     */
1004 	    code = 0x17d;
1005 	    break;
1006 	case 145:
1007 	    /*
1008 	     * LEFT SINGLE QUOTATION MARK (lsquo)
1009 	     */
1010 	    code = 0x2018;
1011 	    break;
1012 	case 146:
1013 	    /*
1014 	     * RIGHT SINGLE QUOTATION MARK (rsquo)
1015 	     */
1016 	    code = 0x2019;
1017 	    break;
1018 	case 147:
1019 	    /*
1020 	     * LEFT DOUBLE QUOTATION MARK (ldquo)
1021 	     */
1022 	    code = 0x201c;
1023 	    break;
1024 	case 148:
1025 	    /*
1026 	     * RIGHT DOUBLE QUOTATION MARK (rdquo)
1027 	     */
1028 	    code = 0x201d;
1029 	    break;
1030 	case 149:
1031 	    /*
1032 	     * BULLET (bull)
1033 	     */
1034 	    code = 0x2022;
1035 	    break;
1036 	case 150:
1037 	    /*
1038 	     * EN DASH (ndash)
1039 	     */
1040 	    code = 0x2013;
1041 	    break;
1042 	case 151:
1043 	    /*
1044 	     * EM DASH (mdash)
1045 	     */
1046 	    code = 0x2014;
1047 	    break;
1048 	case 152:
1049 	    /*
1050 	     * SMALL TILDE (tilde)
1051 	     */
1052 	    code = 0x02dc;
1053 	    break;
1054 	case 153:
1055 	    /*
1056 	     * TRADE MARK SIGN (trade)
1057 	     */
1058 	    code = 0x2122;
1059 	    break;
1060 	case 154:
1061 	    /*
1062 	     * LATIN SMALL LETTER S WITH CARON
1063 	     */
1064 	    code = 0x161;
1065 	    break;
1066 	case 155:
1067 	    /*
1068 	     * SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (rsaquo)
1069 	     */
1070 	    code = 0x203a;
1071 	    break;
1072 	case 156:
1073 	    /*
1074 	     * LATIN SMALL LIGATURE OE
1075 	     */
1076 	    code = 0x153;
1077 	    break;
1078 	case 158:
1079 	    /*
1080 	     * LATIN SMALL LETTER Z WITH CARON
1081 	     */
1082 	    code = 0x17e;
1083 	    break;
1084 	case 159:
1085 	    /*
1086 	     * LATIN CAPITAL LETTER Y WITH DIAERESIS
1087 	     */
1088 	    code = 0x178;
1089 	    break;
1090 	default:
1091 	    /*
1092 	     * Undefined (by convention, use the replacement character).
1093 	     */
1094 	    code = UCS_REPL;
1095 	    break;
1096 	}
1097     }
1098     return code;
1099 }
1100 
1101 /*
1102  * Function to select a character set and then set the character handling and
1103  * LYHaveCJKCharacterSet flag.  - FM
1104  */
HTMLUseCharacterSet(int i)1105 void HTMLUseCharacterSet(int i)
1106 {
1107     HTMLSetRawModeDefault(i);
1108     p_entity_values = LYCharSets[i];
1109     HTMLSetCharacterHandling(i);	/* set LYRawMode and CJK attributes */
1110     HTMLSetHaveCJKCharacterSet(i);
1111     HTMLSetDisplayCharsetMatchLocale(i);
1112     return;
1113 }
1114 
1115 /*
1116  * Initializer, calls initialization function for the CHARTRANS handling.  - KW
1117  */
LYCharSetsDeclared(void)1118 int LYCharSetsDeclared(void)
1119 {
1120     UCInit();
1121 
1122     return UCInitialized;
1123 }
1124 
#ifdef USE_CHARSET_CHOICE
/*
 * Prepare the charset subsets used for the display-charset and
 * assumed-document-charset choices.
 *
 * The charset currently used for unspecified documents (UCLYhndl_for_unspec)
 * and the current display charset (current_char_set) are always un-hidden.
 * Either handle may not be set yet (current_char_set starts out as -1), so
 * each is range-checked before it is used as an index.
 *
 * Unless ALL_CHARSETS_IN_O_MENU_SCREEN is defined, the lists for the old
 * menu screen are rebuilt as well.  For each charset i < LYNumCharsets:
 *   - if its hide_display is FALSE, it is appended to display_charset_map[]
 *     and display_charset_choices[] (using LYchar_set_names[i]), and
 *     displayed_display_charset_idx records the position of the current
 *     display charset;
 *   - if its hide_assumed is FALSE, it is appended to
 *     assumed_doc_charset_map[] and assumed_charset_choices[] (using
 *     LYCharSet_UC[i].MIMEname), and the position is stored in
 *     charset_subsets[i].assumed_idx.
 * Both choice lists are NULL-terminated, also when no charset qualifies.
 */
void init_charset_subsets(void)
{
    /* add them to displayed values */
    if (UCLYhndl_for_unspec >= 0 && UCLYhndl_for_unspec < MAXCHARSETS)
	charset_subsets[UCLYhndl_for_unspec].hide_assumed = FALSE;
    if (current_char_set >= 0 && current_char_set < MAXCHARSETS)
	charset_subsets[current_char_set].hide_display = FALSE;

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    /*all this stuff is for supporting old menu screen... */
    {
	int i, n;
	int cur_display = 0;
	int cur_assumed = 0;

	for (i = 0; i < LYNumCharsets; ++i) {
	    if (charset_subsets[i].hide_display == FALSE) {
		n = cur_display++;
		if (i == current_char_set)
		    displayed_display_charset_idx = n;
		display_charset_map[n] = i;
		display_charset_choices[n] = LYchar_set_names[i];
	    }
	    if (charset_subsets[i].hide_assumed == FALSE) {
		n = cur_assumed++;
		assumed_doc_charset_map[n] = i;
		assumed_charset_choices[n] = LYCharSet_UC[i].MIMEname;
		charset_subsets[i].assumed_idx = n;
	    }
	}

	/* terminate once, after the final counts are known */
	display_charset_choices[cur_display] = NULL;
	assumed_charset_choices[cur_assumed] = NULL;
    }
#endif
}
#endif /* USE_CHARSET_CHOICE */
1158