lynx2.9.0dev.9/src/LYCharSets.c

/*
 * $LynxId: LYCharSets.c,v 1.71 2021/06/29 22:01:12 tom Exp $
 */
#include <HTUtils.h>
#include <HTCJK.h>
#include <HTMLDTD.h>

#include <LYGlobalDefs.h>
#include <UCMap.h>
#include <UCdomap.h>
#include <UCDefs.h>
#include <LYCharSets.h>
#include <GridText.h>
#include <LYCurses.h>
#include <LYStrings.h>

#include <LYLeaks.h>

/*
 * Global character-set state defined by this file.
 */
HTkcode kanji_code = NOKANJI;	/* assigned in HTMLSetCharacterHandling() */
BOOLEAN LYHaveCJKCharacterSet = FALSE;	/* assigned in HTMLSetHaveCJKCharacterSet() */
BOOLEAN DisplayCharsetMatchLocale = TRUE;	/* assigned in HTMLSetDisplayCharsetMatchLocale() */
BOOL force_old_UCLYhndl_on_reload = FALSE;
int forced_UCLYhdnl;		/* no explicit initializer (zero as a file-scope object) */
int LYNumCharsets = 0;		/* Will be initialized later by UC_Register. */
int current_char_set = -1;	/* will be initialized later in LYMain.c */
int linedrawing_char_set = -1;
STRING2PTR p_entity_values = NULL;	/* Pointer, for HTML_put_entity() */

			      /* obsolete and probably not used(???)        */
			      /* will be initialized in HTMLUseCharacterSet */
/*
 * The following tables exist only when USE_CHARSET_CHOICE is defined; the
 * map/choices arrays are additionally omitted when
 * ALL_CHARSETS_IN_O_MENU_SCREEN is defined.
 * NOTE(review): they appear to support restricting the charsets offered for
 * display and assumed-document selection - confirm against their users.
 */
#ifdef USE_CHARSET_CHOICE
charset_subset_t charset_subsets[MAXCHARSETS];
BOOL custom_display_charset = FALSE;
BOOL custom_assumed_doc_charset = FALSE;

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
int display_charset_map[MAXCHARSETS];
int assumed_doc_charset_map[MAXCHARSETS];

/* one slot more than MAXCHARSETS in each of the choices arrays */
const char *display_charset_choices[MAXCHARSETS + 1];
const char *assumed_charset_choices[MAXCHARSETS + 1];
int displayed_display_charset_idx;
#endif
#endif /* USE_CHARSET_CHOICE */

/*
 * New character sets now declared with UCInit() in UCdomap.c
 *
 * INSTRUCTIONS for adding new character sets which do not have
 *		Unicode tables now in UCdomap.h
 *
 *
 * [You should no longer need to correct or add old-style mappings such as
 * ISO_Latin1[] or SevenBitApproximations[] below - translation now works via
 * the new chartrans mechanism, and these tables are kept for compatibility
 * only.  They should be cleaned up eventually, but that is not so easy...]
 *
 * Currently we only declare some charset's properties here (such as MIME
 * names, etc.), it does not include real mapping.
 *
 * There is a place marked "Add your new character sets HERE" in this file.
 * Make up a character set and add it in the same style as the ISO_LATIN1 set
 * below, giving it a unique name.
 *
 * Add the name of the set to LYCharSets.  Similarly add the appropriate
 * information to the tables below:  LYchar_set_names, LYCharSet_UC,
 * LYlowest_eightbit.  These 4 tables all MUST have the same order.  (And this
 * is the order you will see in Lynx Options Menu, which is why few
 * unicode-based charsets are listed here).
 *
 */

/*	Entity values -- for ISO Latin 1 local representation
 *
 *	This MUST match exactly the table referred to in the DTD!
 *
 *	Each entry is the string emitted for one named entity; the entity
 *	name is given at the end of the entry's comment and the entries are
 *	listed in the order of those names.  Most entries are the single
 *	ISO-8859-1 byte for the character (written in octal).  The entries
 *	"\001" (nbsp), "\002" (emsp, ensp, thinsp) and "\007" (shy) are
 *	marked NEVER CHANGE THIS.
 *	NOTE(review): those three values are presumably internal codes that
 *	other parts of Lynx interpret - confirm before touching them.
 */
static const char *ISO_Latin1[] =
{
    "\306",			/* capital AE diphthong (ligature) (&#198;) - AElig */
    "\301",			/* capital A, acute accent (&#193;) - Aacute */
    "\302",			/* capital A, circumflex accent (&#194;) - Acirc */
    "\300",			/* capital A, grave accent (&#192;) - Agrave */
    "\305",			/* capital A, ring (&#197;) - Aring */
    "\303",			/* capital A, tilde (&#195;) - Atilde */
    "\304",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
    "\307",			/* capital C, cedilla (&#199;) - Ccedil */
    "\320",			/* capital Eth or D with stroke (&#208;) - Dstrok */
    "\320",			/* capital Eth, Icelandic (&#208;) - ETH */
    "\311",			/* capital E, acute accent (&#201;) - Eacute */
    "\312",			/* capital E, circumflex accent (&#202;) - Ecirc */
    "\310",			/* capital E, grave accent (&#200;) - Egrave */
    "\313",			/* capital E, dieresis or umlaut mark (&#203;) - Euml */
    "\315",			/* capital I, acute accent (&#205;) - Iacute */
    "\316",			/* capital I, circumflex accent (&#206;) - Icirc */
    "\314",			/* capital I, grave accent (&#204;) - Igrave */
    "\317",			/* capital I, dieresis or umlaut mark (&#207;) - Iuml */
    "\321",			/* capital N, tilde (&#209;) - Ntilde */
    "\323",			/* capital O, acute accent (&#211;) - Oacute */
    "\324",			/* capital O, circumflex accent (&#212;) - Ocirc */
    "\322",			/* capital O, grave accent (&#210;) - Ograve */
    "\330",			/* capital O, slash (&#216;) - Oslash */
    "\325",			/* capital O, tilde (&#213;) - Otilde */
    "\326",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
    "\336",			/* capital THORN, Icelandic (&#222;) - THORN */
    "\332",			/* capital U, acute accent (&#218;) - Uacute */
    "\333",			/* capital U, circumflex accent (&#219;) - Ucirc */
    "\331",			/* capital U, grave accent (&#217;) - Ugrave */
    "\334",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
    "\335",			/* capital Y, acute accent (&#221;) - Yacute */
    "\341",			/* small a, acute accent (&#225;) - aacute */
    "\342",			/* small a, circumflex accent (&#226;) - acirc */
    "\264",			/* spacing acute (&#180;) - acute */
    "\346",			/* small ae diphthong (ligature) (&#230;) - aelig */
    "\340",			/* small a, grave accent (&#224;) - agrave */
    "\046",			/* ampersand (&#38;) - amp */
    "\345",			/* small a, ring (&#229;) - aring */
    "\343",			/* small a, tilde (&#227;) - atilde */
    "\344",			/* small a, dieresis or umlaut mark (&#228;) - auml */
    "\246",			/* broken vertical bar (&#166;) - brkbar */
    "\246",			/* broken vertical bar (&#166;) - brvbar */
    "\347",			/* small c, cedilla (&#231;) - ccedil */
    "\270",			/* spacing cedilla (&#184;) - cedil */
    "\242",			/* cent sign (&#162;) - cent */
    "\251",			/* copyright sign (&#169;) - copy */
    "\244",			/* currency sign (&#164;) - curren */
    "\260",			/* degree sign (&#176;) - deg */
    "\250",			/* spacing dieresis (&#168;) - die */
    "\367",			/* division sign (&#247;) - divide */
    "\351",			/* small e, acute accent (&#233;) - eacute */
    "\352",			/* small e, circumflex accent (&#234;) - ecirc */
    "\350",			/* small e, grave accent (&#232;) - egrave */
    "-",			/* dash the width of emsp - emdash */
    "\002",			/* emsp, em space - not collapsed NEVER CHANGE THIS - emsp */
    "-",			/* dash the width of ensp - endash */
    "\002",			/* ensp, en space - not collapsed NEVER CHANGE THIS - ensp */
    "\360",			/* small eth, Icelandic (&#240;) - eth */
    "\353",			/* small e, dieresis or umlaut mark (&#235;) - euml */
    "\275",			/* fraction 1/2 (&#189;) - frac12 */
    "\274",			/* fraction 1/4 (&#188;) - frac14 */
    "\276",			/* fraction 3/4 (&#190;) - frac34 */
    "\076",			/* greater than (&#62;) - gt */
    "\257",			/* spacing macron (&#175;) - hibar */
    "\355",			/* small i, acute accent (&#237;) - iacute */
    "\356",			/* small i, circumflex accent (&#238;) - icirc */
    "\241",			/* inverted exclamation mark (&#161;) - iexcl */
    "\354",			/* small i, grave accent (&#236;) - igrave */
    "\277",			/* inverted question mark (&#191;) - iquest */
    "\357",			/* small i, dieresis or umlaut mark (&#239;) - iuml */
    "\253",			/* angle quotation mark, left (&#171;) - laquo */
    "\074",			/* less than (&#60;) - lt */
    "\257",			/* spacing macron (&#175;) - macr */
    "-",			/* dash the width of emsp - mdash */
    "\265",			/* micro sign (&#181;) - micro */
    "\267",			/* middle dot (&#183;) - middot */
    "\001",			/* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",			/* dash the width of ensp - ndash */
    "\254",			/* negation sign (&#172;) - not */
    "\361",			/* small n, tilde (&#241;) - ntilde */
    "\363",			/* small o, acute accent (&#243;) - oacute */
    "\364",			/* small o, circumflex accent (&#244;) - ocirc */
    "\362",			/* small o, grave accent (&#242;) - ograve */
    "\252",			/* feminine ordinal indicator (&#170;) - ordf */
    "\272",			/* masculine ordinal indicator (&#186;) - ordm */
    "\370",			/* small o, slash (&#248;) - oslash */
    "\365",			/* small o, tilde (&#245;) - otilde */
    "\366",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
    "\266",			/* paragraph sign (&#182;) - para */
    "\261",			/* plus-or-minus sign (&#177;) - plusmn */
    "\243",			/* pound sign (&#163;) - pound */
    "\042",			/* quote '"' (&#34;) - quot */
    "\273",			/* angle quotation mark, right (&#187;) - raquo */
    "\256",			/* circled R registered sign (&#174;) - reg */
    "\247",			/* section sign (&#167;) - sect */
    "\007",			/* soft hyphen (&#173;) NEVER CHANGE THIS - shy */
    "\271",			/* superscript 1 (&#185;) - sup1 */
    "\262",			/* superscript 2 (&#178;) - sup2 */
    "\263",			/* superscript 3 (&#179;) - sup3 */
    "\337",			/* small sharp s, German (sz ligature) (&#223;) - szlig */
    "\002",			/* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "\376",			/* small thorn, Icelandic (&#254;) - thorn */
    "\327",			/* multiplication sign (&#215;) - times */
    "(TM)",			/* circled TM trade mark sign (&#8482;) - trade */
    "\372",			/* small u, acute accent (&#250;) - uacute */
    "\373",			/* small u, circumflex accent (&#251;) - ucirc */
    "\371",			/* small u, grave accent (&#249;) - ugrave */
    "\250",			/* spacing dieresis (&#168;) - uml */
    "\374",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
    "\375",			/* small y, acute accent (&#253;) - yacute */
    "\245",			/* yen sign (&#165;) - yen */
    "\377",			/* small y, dieresis or umlaut mark (&#255;) - yuml */
};

/*	Entity values -- 7 bit character approximations
 *
 *	This MUST match exactly the table referred to in the DTD!
 *
 *	Same entity order as ISO_Latin1[], but every entry is a plain 7-bit
 *	string approximating the character.  When LY_UMLAUT is defined, the
 *	umlauted vowels Auml, Ouml, Uuml, auml, ouml and uuml are rendered
 *	with a trailing 'e' ("Ae", "oe", ...) instead of the bare vowel.
 *	The "\001", "\002" and "\007" entries are the same NEVER CHANGE THIS
 *	values used in ISO_Latin1[].
 */
const char *SevenBitApproximations[] =
{
    "AE",			/* capital AE diphthong (ligature) (&#198;) - AElig */
    "A",			/* capital A, acute accent (&#193;) - Aacute */
    "A",			/* capital A, circumflex accent (&#194;) - Acirc */
    "A",			/* capital A, grave accent (&#192;) - Agrave */
    "A",			/* capital A, ring (&#197;) - Aring */
    "A",			/* capital A, tilde (&#195;) - Atilde */
#ifdef LY_UMLAUT
    "Ae",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
#else
    "A",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
#endif				/* LY_UMLAUT */
    "C",			/* capital C, cedilla (&#199;) - Ccedil */
    "Dj",			/* capital D with stroke (&#208;) - Dstrok */
    "DH",			/* capital Eth, Icelandic (&#208;) - ETH */
    "E",			/* capital E, acute accent (&#201;) - Eacute */
    "E",			/* capital E, circumflex accent (&#202;) - Ecirc */
    "E",			/* capital E, grave accent (&#200;) - Egrave */
    "E",			/* capital E, dieresis or umlaut mark (&#203;) - Euml */
    "I",			/* capital I, acute accent (&#205;) - Iacute */
    "I",			/* capital I, circumflex accent (&#206;) - Icirc */
    "I",			/* capital I, grave accent (&#204;) - Igrave */
    "I",			/* capital I, dieresis or umlaut mark (&#207;) - Iuml */
    "N",			/* capital N, tilde (&#209;) - Ntilde */
    "O",			/* capital O, acute accent (&#211;) - Oacute */
    "O",			/* capital O, circumflex accent (&#212;) - Ocirc */
    "O",			/* capital O, grave accent (&#210;) - Ograve */
    "O",			/* capital O, slash (&#216;) - Oslash */
    "O",			/* capital O, tilde (&#213;) - Otilde */
#ifdef LY_UMLAUT
    "Oe",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
#else
    "O",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
#endif				/* LY_UMLAUT */
    "P",			/* capital THORN, Icelandic (&#222;) - THORN */
    "U",			/* capital U, acute accent (&#218;) - Uacute */
    "U",			/* capital U, circumflex accent (&#219;) - Ucirc */
    "U",			/* capital U, grave accent (&#217;) - Ugrave */
#ifdef LY_UMLAUT
    "Ue",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
#else
    "U",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
#endif				/* LY_UMLAUT */
    "Y",			/* capital Y, acute accent (&#221;) - Yacute */
    "a",			/* small a, acute accent (&#225;) - aacute */
    "a",			/* small a, circumflex accent (&#226;) - acirc */
    "'",			/* spacing acute (&#180;) - acute */
    "ae",			/* small ae diphthong (ligature) (&#230;) - aelig */
    "`a",			/* small a, grave accent (&#224;) - agrave */
    "&",			/* ampersand (&#38;) - amp */
    "a",			/* small a, ring (&#229;) - aring */
    "a",			/* small a, tilde (&#227;) - atilde */
#ifdef LY_UMLAUT
    "ae",			/* small a, dieresis or umlaut mark (&#228;) - auml */
#else
    "a",			/* small a, dieresis or umlaut mark (&#228;) - auml */
#endif				/* LY_UMLAUT */
    "|",			/* broken vertical bar (&#166;) - brkbar */
    "|",			/* broken vertical bar (&#166;) - brvbar */
    "c",			/* small c, cedilla (&#231;) - ccedil */
    ",",			/* spacing cedilla (&#184;) - cedil */
    "-c-",			/* cent sign (&#162;) - cent */
    "(c)",			/* copyright sign (&#169;) - copy */
    "CUR",			/* currency sign (&#164;) - curren */
    "DEG",			/* degree sign (&#176;) - deg */
    "\042",			/* spacing dieresis (&#168;) - die */
    "/",			/* division sign (&#247;) - divide */
    "e",			/* small e, acute accent (&#233;) - eacute */
    "e",			/* small e, circumflex accent (&#234;) - ecirc */
    "e",			/* small e, grave accent (&#232;) - egrave */
    "-",			/* dash the width of emsp - emdash */
    "\002",			/* emsp NEVER CHANGE THIS - emsp */
    "-",			/* dash the width of ensp - endash */
    "\002",			/* ensp NEVER CHANGE THIS - ensp */
    "dh",			/* small eth, Icelandic (&#240;) - eth */
    "e",			/* small e, dieresis or umlaut mark (&#235;) - euml */
    " 1/2",			/* fraction 1/2 (&#189;) - frac12 */
    " 1/4",			/* fraction 1/4 (&#188;) - frac14 */
    " 3/4",			/* fraction 3/4 (&#190;) - frac34 */
    ">",			/* greater than (&#62;) - gt */
    "-",			/* spacing macron (&#175;) - hibar */
    "i",			/* small i, acute accent (&#237;) - iacute */
    "i",			/* small i, circumflex accent (&#238;) - icirc */
    "!",			/* inverted exclamation mark (&#161;) - iexcl */
    "`i",			/* small i, grave accent (&#236;) - igrave */
    "?",			/* inverted question mark (&#191;) - iquest */
    "i",			/* small i, dieresis or umlaut mark (&#239;) - iuml */
    "<<",			/* angle quotation mark, left (&#171;) - laquo */
    "<",			/* less than (&#60;) - lt */
    "-",			/* spacing macron (&#175;) - macr */
    "-",			/* dash the width of emsp - mdash */
    "u",			/* micro sign (&#181;) - micro */
    ".",			/* middle dot (&#183;) - middot */
    "\001",			/* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",			/* dash the width of ensp - ndash */
    "NOT",			/* negation sign (&#172;) - not */
    "n",			/* small n, tilde (&#241;) - ntilde */
    "o",			/* small o, acute accent (&#243;) - oacute */
    "o",			/* small o, circumflex accent (&#244;) - ocirc */
    "o",			/* small o, grave accent (&#242;) - ograve */
    "-a",			/* feminine ordinal indicator (&#170;) - ordf */
    "-o",			/* masculine ordinal indicator (&#186;) - ordm */
    "o",			/* small o, slash (&#248;) - oslash */
    "o",			/* small o, tilde (&#245;) - otilde */
#ifdef LY_UMLAUT
    "oe",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
#else
    "o",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
#endif				/* LY_UMLAUT */
    "P:",			/* paragraph sign (&#182;) - para */
    "+-",			/* plus-or-minus sign (&#177;) - plusmn */
    "-L-",			/* pound sign (&#163;) - pound */
    "\"",			/* quote '"' (&#34;) - quot */
    ">>",			/* angle quotation mark, right (&#187;) - raquo */
    "(R)",			/* circled R registered sign (&#174;) - reg */
    "S:",			/* section sign (&#167;) - sect */
    "\007",			/* soft hyphen (&#173;) NEVER CHANGE THIS - shy */
    "^1",			/* superscript 1 (&#185;) - sup1 */
    "^2",			/* superscript 2 (&#178;) - sup2 */
    "^3",			/* superscript 3 (&#179;) - sup3 */
    "ss",			/* small sharp s, German (sz ligature) (&#223;) - szlig */
    "\002",			/* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "p",			/* small thorn, Icelandic (&#254;) - thorn */
    "*",			/* multiplication sign (&#215;) - times */
    "(TM)",			/* circled TM trade mark sign (&#8482;) - trade */
    "u",			/* small u, acute accent (&#250;) - uacute */
    "u",			/* small u, circumflex accent (&#251;) - ucirc */
    "u",			/* small u, grave accent (&#249;) - ugrave */
    "\042",			/* spacing dieresis (&#168;) - uml */
#ifdef LY_UMLAUT
    "ue",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
#else
    "u",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
#endif				/* LY_UMLAUT */
    "y",			/* small y, acute accent (&#253;) - yacute */
    "YEN",			/* yen sign (&#165;) - yen */
    "y",			/* small y, dieresis or umlaut mark (&#255;) - yuml */
};

/*
 * Add your new character sets HERE (but only if you can't construct Unicode
 * tables for them).  - FM
 */

/*
 * Add the array name to LYCharSets.
 *
 * Each element points at one entity-value table.  Only the first two of the
 * MAXCHARSETS slots are given initializers here; the rest start out as null
 * pointers.  The order must agree with LYchar_set_names[], LYCharSet_UC[] and
 * LYlowest_eightbit[].
 */
STRING2PTR LYCharSets[MAXCHARSETS] =
{
    ISO_Latin1,			/* ISO Latin 1          */
    SevenBitApproximations,	/* 7 Bit Approximations */
};

/*
 * Add the name that the user will see below.  The order of LYCharSets and
 * LYchar_set_names MUST be the same.
 *
 * The list is terminated by a null pointer (hence MAXCHARSETS + 1 slots);
 * UCGetLYhndl_byAnyName() stops its scan at that terminator.
 */
const char *LYchar_set_names[MAXCHARSETS + 1] =
{
    "Western (ISO-8859-1)",
    "7 bit approximations (US-ASCII)",
    (char *) 0
};

/*
 * Associate additional pieces of info with each of the charsets listed above.
 * Will be automatically modified (and extended) by charset translations which
 * are loaded using the chartrans mechanism.  Most important piece of info to
 * put here is a MIME charset name.  Used for chartrans (see UCDefs.h).  The
 * order of LYCharSets and LYCharSet_UC MUST be the same.
 *
 * Note that most of the charsets added by the new mechanism in src/chrtrans
 * don't show up here at all.  They don't have to.
 *
 * Code in this file reads the members .enc, .MIMEname, .codepoints and
 * .like8859 of these entries.
 * NOTE(review): the member order of the initializers below follows the
 * LYUCcharset declaration in UCDefs.h, which is not visible here - check it
 * before adding or reordering fields.
 */
LYUCcharset LYCharSet_UC[MAXCHARSETS] =
{
  /*
   * Zero position placeholder and HTMLGetEntityUCValue() reference.  - FM
   */
    {-1, "iso-8859-1", UCT_ENC_8BIT, 0,
     UCT_REP_IS_LAT1,
     UCT_CP_IS_LAT1, UCT_R_LAT1, UCT_R_LAT1},

  /*
   * Placeholders for Unicode tables.  - FM
   */
    {-1, "us-ascii", UCT_ENC_7BIT, 0,
     UCT_REP_SUBSETOF_LAT1,
     UCT_CP_SUBSETOF_LAT1, UCT_R_ASCII, UCT_R_ASCII},

};

/*
 * Add the code of the lowest character with the high bit set that can be
 * directly displayed.  The order of LYCharSets and LYlowest_eightbit MUST be
 * the same.
 *
 * (If a charset has a chartrans unicode table, LYlowest_eightbit will be
 * verified/modified anyway.)
 *
 * HTMLSetCharacterHandling() compares these values against 130, 155, 160 and
 * 191.  The value 999 used for the 7-bit set is larger than all of those
 * thresholds, so none of its "<=" tests succeed for that set.
 */
int LYlowest_eightbit[MAXCHARSETS] =
{
    160,			/* ISO Latin 1          */
    999,			/* 7 bit approximations */
};

/*
 * Function to set the handling of selected character sets based on the current
 * LYUseDefaultRawMode value.  - FM
 *
 * Parameter i is an index into LYCharSet_UC[] / LYlowest_eightbit[].
 *
 * Globals written here:  HTCJK, kanji_code, LYRawMode, HTPassEightBitRaw,
 * HTPassEightBitNum, HTPassHighCtrlRaw, HTPassHighCtrlNum,
 * UCLYhndl_for_unspec, and (with USE_SLANG) SLsmg_Display_Eight_Bit.  It also
 * calls ena_csi().  No range check is done on i.
 */
void HTMLSetCharacterHandling(int i)
{
    /* handle of the assumed document charset, for comparison with i */
    int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset);

    /* remember the previous values so changes can be traced at the end */
    BOOLEAN LYRawMode_flag = LYRawMode;
    int UCLYhndl_for_unspec_flag = UCLYhndl_for_unspec;

    if (LYCharSet_UC[i].enc != UCT_ENC_CJK) {
	HTCJK = NOCJK;
	kanji_code = NOKANJI;
	/*
	 * Raw mode follows LYUseDefaultRawMode when the selected set is the
	 * assumed document charset, and is its inverse otherwise.
	 */
	if (i == chndl)
	    LYRawMode = LYUseDefaultRawMode;
	else
	    LYRawMode = (BOOL) (!LYUseDefaultRawMode);

	HTPassEightBitNum = (BOOL) ((LYCharSet_UC[i].codepoints & UCT_CP_SUPERSETOF_LAT1)
				    || (LYCharSet_UC[i].like8859 & UCT_R_HIGH8BIT));

	/* 8-bit characters pass raw only in raw mode, and only from 160 up */
	if (LYRawMode) {
	    HTPassEightBitRaw = (BOOL) (LYlowest_eightbit[i] <= 160);
	} else {
	    HTPassEightBitRaw = FALSE;
	}
	/* high control range passes raw only if the set displays from 130 up */
	if (LYRawMode || i == chndl) {
	    HTPassHighCtrlRaw = (BOOL) (LYlowest_eightbit[i] <= 130);
	} else {
	    HTPassHighCtrlRaw = FALSE;
	}

	HTPassHighCtrlNum = FALSE;

    } else {			/* CJK encoding: */
	const char *mime = LYCharSet_UC[i].MIMEname;

	/*
	 * Map the MIME name onto a language/kanji-code pair.  If the name is
	 * none of the five below, HTCJK and kanji_code keep their previous
	 * values at this point.
	 */
	if (!strcmp(mime, "euc-cn")) {
	    HTCJK = CHINESE;
	    kanji_code = EUC;
	} else if (!strcmp(mime, "euc-jp")) {
	    HTCJK = JAPANESE;
	    kanji_code = EUC;
	} else if (!strcmp(mime, "shift_jis")) {
	    HTCJK = JAPANESE;
	    kanji_code = SJIS;
	} else if (!strcmp(mime, "euc-kr")) {
	    HTCJK = KOREAN;
	    kanji_code = EUC;
	} else if (!strcmp(mime, "big5")) {
	    HTCJK = TAIPEI;
	    kanji_code = EUC;
	}

	/* for any CJK: */
	if (!LYUseDefaultRawMode)
	    HTCJK = NOCJK;
	/*
	 * NOTE(review): IS_CJK_TTY is defined elsewhere; presumably it tests
	 * the HTCJK value assigned just above, which is why the order of
	 * these statements matters - confirm.
	 */
	LYRawMode = (BOOL) (IS_CJK_TTY ? TRUE : FALSE);
	HTPassEightBitRaw = FALSE;
	HTPassEightBitNum = FALSE;
	HTPassHighCtrlRaw = (BOOL) (IS_CJK_TTY ? TRUE : FALSE);
	HTPassHighCtrlNum = FALSE;
    }

    /*
     * Comment for coding below:
     * UCLYhndl_for_unspec is "current" state with LYRawMode, but
     * UCAssume_MIMEcharset is independent from LYRawMode:  holds the history
     * and may be changed from 'O'ptions menu only.  - LP
     */
    if (LYRawMode) {
	UCLYhndl_for_unspec = i;	/* UCAssume_MIMEcharset not changed! */
    } else {
	if (chndl != i &&
	    (LYCharSet_UC[i].enc != UCT_ENC_CJK ||
	     LYCharSet_UC[chndl].enc != UCT_ENC_CJK)) {
	    UCLYhndl_for_unspec = chndl;	/* fall to UCAssume_MIMEcharset */
	} else {
	    UCLYhndl_for_unspec = LATIN1;	/* UCAssume_MIMEcharset not changed! */
	}
    }

#ifdef USE_SLANG
    if (LYlowest_eightbit[i] > 191) {
	/*
	 * Higher than this may output cntrl chars to screen.  - KW
	 */
	SLsmg_Display_Eight_Bit = 191;
    } else {
	SLsmg_Display_Eight_Bit = LYlowest_eightbit[i];
    }
#endif /* USE_SLANG */

    /*
     * NOTE(review): this indexes by current_char_set, not by i, and
     * current_char_set starts out as -1 in this file; the call therefore
     * relies on current_char_set having been set to a valid index before
     * this function runs - confirm against callers.
     */
    ena_csi(LYlowest_eightbit[current_char_set] > 155);

    /* some diagnostics */
    if (TRACE) {
	if (LYRawMode_flag != LYRawMode)
	    CTRACE((tfp,
		    "HTMLSetCharacterHandling: LYRawMode changed %s -> %s\n",
		    (LYRawMode_flag ? "ON" : "OFF"),
		    (LYRawMode ? "ON" : "OFF")));
	if (UCLYhndl_for_unspec_flag != UCLYhndl_for_unspec)
	    CTRACE((tfp,
		    "HTMLSetCharacterHandling: UCLYhndl_for_unspec changed %d -> %d\n",
		    UCLYhndl_for_unspec_flag,
		    UCLYhndl_for_unspec));
    }

    return;
}

/*
 * Select the HTCJK language mode for a pair of charsets given by their MIME
 * names:  inMIMEname for the input side, outMIMEname for the output side.
 *
 * HTCJK becomes NOCJK unless LYRawMode is set and both names belong to the
 * same CJK family (Japanese accepts euc-jp or shift_jis on either side, plus
 * utf-8 as input when USE_JAPANESEUTF8_SUPPORT is defined).
 */
void Set_HTCJK(const char *inMIMEname,
	       const char *outMIMEname)
{
    int japanese_in;
    int japanese_out;

    /* synonyms need no handling: the MIME names are taken from LYCharSet_UC */

    if (!LYRawMode) {
	HTCJK = NOCJK;
	return;
    }

    japanese_in = (strcmp(inMIMEname, "euc-jp") == 0 ||
		   strcmp(inMIMEname, "shift_jis") == 0);
#ifdef USE_JAPANESEUTF8_SUPPORT
    if (!japanese_in)
	japanese_in = (strcmp(inMIMEname, "utf-8") == 0);
#endif
    japanese_out = (strcmp(outMIMEname, "euc-jp") == 0 ||
		    strcmp(outMIMEname, "shift_jis") == 0);

    if (japanese_in && japanese_out) {
	HTCJK = JAPANESE;
    } else if (strcmp(inMIMEname, "euc-cn") == 0 &&
	       strcmp(outMIMEname, "euc-cn") == 0) {
	HTCJK = CHINESE;
    } else if (strcmp(inMIMEname, "big5") == 0 &&
	       strcmp(outMIMEname, "big5") == 0) {
	HTCJK = TAIPEI;
    } else if (strcmp(inMIMEname, "euc-kr") == 0 &&
	       strcmp(outMIMEname, "euc-kr") == 0) {
	HTCJK = KOREAN;
    } else {
	HTCJK = NOCJK;
    }
}

/*
 * Function to set the LYDefaultRawMode value based on the selected character
 * set.  - FM
 *
 * Currently unused:  the default value so obvious that LYUseDefaultRawMode
 * utilized directly by someone's mistake.  - LP
 */
static void HTMLSetRawModeDefault(int i)
{
    LYDefaultRawMode = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK);
    return;
}

/*
 * Derive LYUseDefaultRawMode from the selected character set i and the raw
 * mode flag modeflag.  - FM
 *
 * For a CJK set, or for the set that is the assumed document charset, the
 * flag is stored as given; for any other set its logical negation is stored.
 */
void HTMLSetUseDefaultRawMode(int i,
			      int modeflag)
{
    if (LYCharSet_UC[i].enc == UCT_ENC_CJK) {
	/* CJK encoding: take the flag as it is */
	LYUseDefaultRawMode = (BOOLEAN) modeflag;
	return;
    }

    if (safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset) == i) {
	LYUseDefaultRawMode = (BOOLEAN) modeflag;
    } else {
	LYUseDefaultRawMode = (BOOL) (!modeflag);
    }
}

/*
 * Function to set the LYHaveCJKCharacterSet value based on the selected
 * character set.  - FM
 */
static void HTMLSetHaveCJKCharacterSet(int i)
{
    LYHaveCJKCharacterSet = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK);
    return;
}

/*
 * Function to set the DisplayCharsetMatchLocale value based on the selected
 * character set.  It is used in UPPER8 for 8bit case-insensitive search by
 * matching def7_uni.tbl images.  - LP
 *
 * Parameter i is an index into LYCharSet_UC[].  The result is:
 *   - TRUE when a CJK character set is in use (LYHaveCJKCharacterSet);
 *   - FALSE when the MIME name starts with "cp" or "windows";
 *   - otherwise TRUE, except that without LOCALE only a UTF-8 set stays
 *     TRUE, and with LOCALE the UCForce8bitTOUPPER setting forces FALSE.
 *
 * The prefix test used to be written without negation.  Since strncasecomp()
 * yields zero on a match, and no name can start with both prefixes, that
 * condition held for every charset:  the result was always FALSE and the
 * final branch could never be reached.
 */
static void HTMLSetDisplayCharsetMatchLocale(int i)
{
    BOOLEAN match;

    if (LYHaveCJKCharacterSet) {
	/*
	 * We have no intention to pass CJK via UCTransChar if that happened.
	 * Let someone from CJK correct this if necessary.
	 */
	DisplayCharsetMatchLocale = TRUE;	/* old-style */
	return;

    } else if (!strncasecomp(LYCharSet_UC[i].MIMEname, "cp", 2) ||
	       !strncasecomp(LYCharSet_UC[i].MIMEname, "windows", 7)) {
	/*
	 * Assume dos/windows displays usually on remote terminal, hence it
	 * rarely matches locale.  (In fact, MS Windows codepoints locale are
	 * never seen on UNIX).
	 */
	match = FALSE;
    } else {
	match = TRUE;		/* guess, but see below */

#if !defined(LOCALE)
	if (LYCharSet_UC[i].enc != UCT_ENC_UTF8)
	    /*
	     * Leave true for utf-8 display - the code doesn't deal very well
	     * with this case.  - kw
	     */
	    match = FALSE;
#else
	if (UCForce8bitTOUPPER) {
	    /*
	     * Force disable locale (from lynx.cfg)
	     */
	    match = FALSE;
	}
#endif
    }

    DisplayCharsetMatchLocale = match;
    return;
}

/*
 * lynx 2.8/2.7.2 (and earlier) compatibility code:  the "human-readable"
 * charset names have changed over time, so the historical names are mapped to
 * MIME names here to keep old lynx.cfg and (especially) .lynxrc files
 * recognized.  Please update the table when you change the "fullname" of any
 * present charset.
 */
typedef struct _names_pairs {
    const char *fullname;	/* historical human-readable charset name */
    const char *MIMEname;	/* MIME charset name it is mapped to */
} names_pairs;
/* *INDENT-OFF* */
/*
 * Historical display names and the MIME charset each one maps to.  Several
 * old names may map to the same MIME name.  UCGetLYhndl_byAnyName() scans this
 * table from the top and stops at the entry whose fullname is NULL.
 */
static const names_pairs OLD_charset_names[] =
{
    {"ISO Latin 1",		"iso-8859-1"},
    {"ISO Latin 2",             "iso-8859-2"},
    {"WinLatin1 (cp1252)",      "windows-1252"},
    {"DEC Multinational",       "dec-mcs"},
    {"Macintosh (8 bit)",       "macintosh"},
    {"NeXT character set",      "next"},
    {"KOI8-R Cyrillic",         "koi8-r"},
    {"Chinese",                 "euc-cn"},
    {"Japanese (EUC)",          "euc-jp"},
    {"Japanese (SJIS)",         "shift_jis"},
    {"Korean",                  "euc-kr"},
    {"Taipei (Big5)",           "big5"},
    {"Vietnamese (VISCII)",     "viscii"},
    {"7 bit approximations",    "us-ascii"},
    {"Transparent",             "x-transparent"},
    {"DosLatinUS (cp437)",      "cp437"},
    {"IBM PC character set",    "cp437"},
    {"DosLatin1 (cp850)",       "cp850"},
    {"IBM PC codepage 850",     "cp850"},
    {"DosLatin2 (cp852)",       "cp852"},
    {"PC Latin2 CP 852",        "cp852"},
    {"DosCyrillic (cp866)",     "cp866"},
    {"DosArabic (cp864)",       "cp864"},
    {"DosGreek (cp737)",        "cp737"},
    {"DosBaltRim (cp775)",      "cp775"},
    {"DosGreek2 (cp869)",       "cp869"},
    {"DosHebrew (cp862)",       "cp862"},
    {"WinLatin2 (cp1250)",      "windows-1250"},
    {"WinCyrillic (cp1251)",    "windows-1251"},
    {"WinGreek (cp1253)",       "windows-1253"},
    {"WinHebrew (cp1255)",      "windows-1255"},
    {"WinArabic (cp1256)",      "windows-1256"},
    {"WinBaltRim (cp1257)",     "windows-1257"},
    {"ISO Latin 3",             "iso-8859-3"},
    {"ISO Latin 4",             "iso-8859-4"},
    {"ISO 8859-5 Cyrillic",     "iso-8859-5"},
    {"ISO 8859-6 Arabic",       "iso-8859-6"},
    {"ISO 8859-7 Greek",        "iso-8859-7"},
    {"ISO 8859-8 Hebrew",       "iso-8859-8"},
    {"ISO-8859-8-I",            "iso-8859-8"},
    {"ISO-8859-8-E",            "iso-8859-8"},
    {"ISO 8859-9 (Latin 5)",    "iso-8859-9"},
    {"ISO 8859-10",             "iso-8859-10"},
    {"UNICODE UTF 8",           "utf-8"},
    {"RFC 1345 w/o Intro",      "mnemonic+ascii+0"},
    {"RFC 1345 Mnemonic",       "mnemonic"},
    {NULL, NULL},		/* terminated with NULL */
};
/* *INDENT-ON* */

/*
 * lynx 2.8/2.7.2 compatibility code:  read "character_set" parameter from
 * lynx.cfg and .lynxrc in both MIME name and "human-readable" name (old and
 * new style).  Returns -1 if not recognized.
 */
int UCGetLYhndl_byAnyName(char *value)
{
    int i;

    if (value == NULL)
	return -1;

    LYTrimTrailing(value);
    CTRACE((tfp, "UCGetLYhndl_byAnyName(%s)\n", value));

    /* search by name */
    for (i = 0; (i < MAXCHARSETS && LYchar_set_names[i]); i++) {
	if (!strcmp(value, LYchar_set_names[i])) {
	    return i;		/* OK */
	}
    }

    /* search by old name from 2.8/2.7.2 version */
    for (i = 0; (OLD_charset_names[i].fullname); i++) {
	if (!strcmp(value, OLD_charset_names[i].fullname)) {
	    return UCGetLYhndl_byMIME(OLD_charset_names[i].MIMEname);	/* OK */
	}
    }

    return UCGetLYhndl_byMIME(value);	/* by MIME */
}

/*
 * Entity names -- Ordered by ISO Latin 1 value.
 * ---------------------------------------------
 * For conversions of DECIMAL escaped entities.
 * Must be in order of ascending value.
 *
 * Element 0 is the name for value 160 and the last element is the name for
 * value 255, as listed in the per-entry comments.
 * NOTE(review): HTMLGetEntityName() indexes this table directly with its
 * argument, so callers presumably pass (value - 160) - confirm against them.
 */
static const char *LYEntityNames[] =
{
/*	 NAME		   DECIMAL VALUE */
    "nbsp",			/* 160, non breaking space */
    "iexcl",			/* 161, inverted exclamation mark */
    "cent",			/* 162, cent sign */
    "pound",			/* 163, pound sign */
    "curren",			/* 164, currency sign */
    "yen",			/* 165, yen sign */
    "brvbar",			/* 166, broken vertical bar, (brkbar) */
    "sect",			/* 167, section sign */
    "uml",			/* 168, spacing dieresis */
    "copy",			/* 169, copyright sign */
    "ordf",			/* 170, feminine ordinal indicator */
    "laquo",			/* 171, angle quotation mark, left */
    "not",			/* 172, negation sign */
    "shy",			/* 173, soft hyphen */
    "reg",			/* 174, circled R registered sign */
    "hibar",			/* 175, spacing macron */
    "deg",			/* 176, degree sign */
    "plusmn",			/* 177, plus-or-minus sign */
    "sup2",			/* 178, superscript 2 */
    "sup3",			/* 179, superscript 3 */
    "acute",			/* 180, spacing acute (96) */
    "micro",			/* 181, micro sign */
    "para",			/* 182, paragraph sign */
    "middot",			/* 183, middle dot */
    "cedil",			/* 184, spacing cedilla */
    "sup1",			/* 185, superscript 1 */
    "ordm",			/* 186, masculine ordinal indicator */
    "raquo",			/* 187, angle quotation mark, right */
    "frac14",			/* 188, fraction 1/4 */
    "frac12",			/* 189, fraction 1/2 */
    "frac34",			/* 190, fraction 3/4 */
    "iquest",			/* 191, inverted question mark */
    "Agrave",			/* 192, capital A, grave accent */
    "Aacute",			/* 193, capital A, acute accent */
    "Acirc",			/* 194, capital A, circumflex accent */
    "Atilde",			/* 195, capital A, tilde */
    "Auml",			/* 196, capital A, dieresis or umlaut mark */
    "Aring",			/* 197, capital A, ring */
    "AElig",			/* 198, capital AE diphthong (ligature) */
    "Ccedil",			/* 199, capital C, cedilla */
    "Egrave",			/* 200, capital E, grave accent */
    "Eacute",			/* 201, capital E, acute accent */
    "Ecirc",			/* 202, capital E, circumflex accent */
    "Euml",			/* 203, capital E, dieresis or umlaut mark */
    "Igrave",			/* 204, capital I, grave accent */
    "Iacute",			/* 205, capital I, acute accent */
    "Icirc",			/* 206, capital I, circumflex accent */
    "Iuml",			/* 207, capital I, dieresis or umlaut mark */
    "ETH",			/* 208, capital Eth, Icelandic (or Latin2 Dstrok) */
    "Ntilde",			/* 209, capital N, tilde */
    "Ograve",			/* 210, capital O, grave accent */
    "Oacute",			/* 211, capital O, acute accent */
    "Ocirc",			/* 212, capital O, circumflex accent */
    "Otilde",			/* 213, capital O, tilde */
    "Ouml",			/* 214, capital O, dieresis or umlaut mark */
    "times",			/* 215, multiplication sign */
    "Oslash",			/* 216, capital O, slash */
    "Ugrave",			/* 217, capital U, grave accent */
    "Uacute",			/* 218, capital U, acute accent */
    "Ucirc",			/* 219, capital U, circumflex accent */
    "Uuml",			/* 220, capital U, dieresis or umlaut mark */
    "Yacute",			/* 221, capital Y, acute accent */
    "THORN",			/* 222, capital THORN, Icelandic */
    "szlig",			/* 223, small sharp s, German (sz ligature) */
    "agrave",			/* 224, small a, grave accent */
    "aacute",			/* 225, small a, acute accent */
    "acirc",			/* 226, small a, circumflex accent */
    "atilde",			/* 227, small a, tilde */
    "auml",			/* 228, small a, dieresis or umlaut mark */
    "aring",			/* 229, small a, ring */
    "aelig",			/* 230, small ae diphthong (ligature) */
    "ccedil",			/* 231, small c, cedilla */
    "egrave",			/* 232, small e, grave accent */
    "eacute",			/* 233, small e, acute accent */
    "ecirc",			/* 234, small e, circumflex accent */
    "euml",			/* 235, small e, dieresis or umlaut mark */
    "igrave",			/* 236, small i, grave accent */
    "iacute",			/* 237, small i, acute accent */
    "icirc",			/* 238, small i, circumflex accent */
    "iuml",			/* 239, small i, dieresis or umlaut mark */
    "eth",			/* 240, small eth, Icelandic */
    "ntilde",			/* 241, small n, tilde */
    "ograve",			/* 242, small o, grave accent */
    "oacute",			/* 243, small o, acute accent */
    "ocirc",			/* 244, small o, circumflex accent */
    "otilde",			/* 245, small o, tilde */
    "ouml",			/* 246, small o, dieresis or umlaut mark */
    "divide",			/* 247, division sign */
    "oslash",			/* 248, small o, slash */
    "ugrave",			/* 249, small u, grave accent */
    "uacute",			/* 250, small u, acute accent */
    "ucirc",			/* 251, small u, circumflex accent */
    "uuml",			/* 252, small u, dieresis or umlaut mark */
    "yacute",			/* 253, small y, acute accent */
    "thorn",			/* 254, small thorn, Icelandic */
    "yuml",			/* 255, small y, dieresis or umlaut mark */
};

/*
 * Look up an entity name of an ISO-8859-1 8-bit character.  - FM
 *
 * The argument is used directly as an index into LYEntityNames[]; an index
 * outside the table yields the empty string.
 */
const char *HTMLGetEntityName(UCode_t code)
{
#define IntValue code
    const int last_index = (TABLESIZE(LYEntityNames) - 1);

    if (IntValue >= 0 && IntValue <= last_index) {
	return LYEntityNames[IntValue];
    }

    return "";
}

/*
 * Function to return the UCode_t (long int) value for entity names.  It
 * returns 0 if not found, and also for a NULL or empty name.
 *
 * unicode_entities[] handles all the names from old style entities[] too.
 * Lynx now calls unicode_entities[] only through this function:
 * HTMLGetEntityUCValue().  Note, we need not check for special characters here
 * in function or even before it, we should check them *after* invoking this
 * function, see put_special_unicodes() in SGML.c.
 *
 * In the future we will try to isolate all calls to entities[] in favor of new
 * unicode-based chartrans scheme.  - LP
 *
 * The lookup is a binary search, so it assumes unicode_entities[] is sorted
 * in the order defined by AS_cmp() (case sensitive).
 */
UCode_t HTMLGetEntityUCValue(const char *name)
{
    /* The include sits inside the function body, as in the original code. */
#include <entities.h>

    size_t lo = 0;
    size_t hi = TABLESIZE(unicode_entities);

    /*
     * Make sure we have a non-zero length name.  - FM
     */
    if (isEmpty(name))
	return 0;

    /*
     * Search the half-open range [lo, hi) of unicode_entities[].
     */
    while (lo < hi) {
	size_t mid = lo + (hi - lo) / 2;
	int cmp = AS_cmp(unicode_entities[mid].name, name);	/* Case sensitive! */

	if (cmp == 0)
	    return unicode_entities[mid].code;

	if (cmp < 0) {
	    lo = mid + 1;	/* table entry sorts before name: go right */
	} else {
	    hi = mid;		/* table entry sorts after name: go left */
	}
    }

    return 0;
}

/*
 * Original comment -
 * Assume these are Microsoft code points, inflicted on us by FrontPage.  - FM
 *
 * MS FrontPage uses syntax like &#153; in 128-159 range and doesn't follow
 * Unicode standards for this area.  Windows-1252 codepoints are assumed here.
 *
 * However see -
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0
 *
 * Behavior:
 *   code == 1        -> U+263A (WHITE SMILING FACE)
 *   code 128..159    -> the Windows-1252 character for that byte, or UCS_REPL
 *                       where Windows-1252 leaves the position undefined
 *   anything else    -> returned unchanged
 */
UCode_t LYcp1252ToUnicode(UCode_t code)
{
    /*
     * Unicode values for bytes 128..159, indexed by (code - 128).  A zero
     * entry marks a position with no mapping; it is turned into UCS_REPL
     * below.
     */
    static const UCode_t high_half[32] =
    {
	0x20ac,			/* 128, EURO currency sign */
	0,			/* 129, undefined */
	0x201a,			/* 130, SINGLE LOW-9 QUOTATION MARK (sbquo) */
	0x0192,			/* 131, LATIN SMALL LETTER F WITH HOOK */
	0x201e,			/* 132, DOUBLE LOW-9 QUOTATION MARK (bdquo) */
	0x2026,			/* 133, HORIZONTAL ELLIPSIS (hellip) */
	0x2020,			/* 134, DAGGER (dagger) */
	0x2021,			/* 135, DOUBLE DAGGER (Dagger) */
	0x02c6,			/* 136, MODIFIER LETTER CIRCUMFLEX ACCENT */
	0x2030,			/* 137, PER MILLE SIGN (permil) */
	0x0160,			/* 138, LATIN CAPITAL LETTER S WITH CARON */
	0x2039,			/* 139, SINGLE LEFT-POINTING ANGLE QUOTATION MARK (lsaquo) */
	0x0152,			/* 140, LATIN CAPITAL LIGATURE OE */
	0,			/* 141, undefined */
	0x017d,			/* 142, LATIN CAPITAL LETTER Z WITH CARON */
	0,			/* 143, undefined */
	0,			/* 144, undefined */
	0x2018,			/* 145, LEFT SINGLE QUOTATION MARK (lsquo) */
	0x2019,			/* 146, RIGHT SINGLE QUOTATION MARK (rsquo) */
	0x201c,			/* 147, LEFT DOUBLE QUOTATION MARK (ldquo) */
	0x201d,			/* 148, RIGHT DOUBLE QUOTATION MARK (rdquo) */
	0x2022,			/* 149, BULLET (bull) */
	0x2013,			/* 150, EN DASH (ndash) */
	0x2014,			/* 151, EM DASH (mdash) */
	0x02dc,			/* 152, SMALL TILDE (tilde) */
	0x2122,			/* 153, TRADE MARK SIGN (trade) */
	0x0161,			/* 154, LATIN SMALL LETTER S WITH CARON */
	0x203a,			/* 155, SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (rsaquo) */
	0x0153,			/* 156, LATIN SMALL LIGATURE OE */
	0,			/* 157, undefined */
	0x017e,			/* 158, LATIN SMALL LETTER Z WITH CARON */
	0x0178,			/* 159, LATIN CAPITAL LETTER Y WITH DIAERESIS */
    };
    UCode_t mapped;

    if (code == 1) {
	/*
	 * WHITE SMILING FACE
	 */
	return 0x263a;
    }

    if (code < 128 || code > 159) {
	/*
	 * Outside the remapped range: pass through untouched.
	 */
	return code;
    }

    mapped = high_half[code - 128];
    if (mapped == 0) {
	/*
	 * Undefined (by convention, use the replacement character).
	 */
	return UCS_REPL;
    }
    return mapped;
}

/*
 * Function to select a character set and then set the character handling and
 * LYHaveCJKCharacterSet flag.  - FM
 *
 * The argument i is passed to each of the HTMLSet...() helpers below and is
 * also used as an index into LYCharSets[] to set p_entity_values.  No range
 * check is done here, so the caller must supply a valid index.
 *
 * NOTE(review): the calls are kept in their original order; whether the
 * helpers depend on that order is not visible from this function - confirm
 * before reordering.
 */
void HTMLUseCharacterSet(int i)
{
    HTMLSetRawModeDefault(i);
    /* Point the global entity table at the one for this charset. */
    p_entity_values = LYCharSets[i];
    HTMLSetCharacterHandling(i);	/* set LYRawMode and CJK attributes */
    HTMLSetHaveCJKCharacterSet(i);
    HTMLSetDisplayCharsetMatchLocale(i);
    return;
}

/*
 * Initializer, calls initialization function for the CHARTRANS handling.  - KW
 *
 * Runs UCInit() and then returns the value of UCInitialized.
 *
 * NOTE(review): UCInitialized is presumably set by UCInit() and nonzero on
 * success - confirm in UCdomap.c.
 */
int LYCharSetsDeclared(void)
{
    UCInit();

    return UCInitialized;
}

#ifdef USE_CHARSET_CHOICE
/*
 * Finish setting up charset_subsets[] and, unless ALL_CHARSETS_IN_O_MENU_SCREEN
 * is defined, build the NULL-terminated choice lists and index maps for the
 * old menu screen.
 *
 * The charset currently used for display (current_char_set) and the one
 * assumed for unspecified documents (UCLYhndl_for_unspec) are always made
 * visible, whatever the hide flags said before.
 *
 * For every charset i below LYNumCharsets that is not hidden:
 *   display list: display_charset_map[n] = i,
 *                 display_charset_choices[n] = LYchar_set_names[i]
 *   assumed list: assumed_doc_charset_map[n] = i,
 *                 assumed_charset_choices[n] = LYCharSet_UC[i].MIMEname,
 *                 charset_subsets[i].assumed_idx = n
 * displayed_display_charset_idx receives the display-list position of
 * current_char_set.
 */
void init_charset_subsets(void)
{
    const size_t n_subsets = sizeof(charset_subsets) / sizeof(charset_subsets[0]);

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    /* Only the old menu screen code needs these; avoid unused variables. */
    int i, n;
    int cur_display = 0;
    int cur_assumed = 0;
#endif

    /*
     * add them to displayed values
     *
     * current_char_set starts out as -1 until it is initialized elsewhere;
     * the unsigned comparison rejects negative and too-large handles so we
     * never write outside charset_subsets[].
     */
    if ((size_t) UCLYhndl_for_unspec < n_subsets)
	charset_subsets[UCLYhndl_for_unspec].hide_assumed = FALSE;
    if ((size_t) current_char_set < n_subsets)
	charset_subsets[current_char_set].hide_display = FALSE;

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    /*all this stuff is for supporting old menu screen... */
    for (i = 0; i < LYNumCharsets; ++i) {
	if (charset_subsets[i].hide_display == FALSE) {
	    n = cur_display++;
	    if (i == current_char_set)
		displayed_display_charset_idx = n;
	    display_charset_map[n] = i;
	    display_charset_choices[n] = LYchar_set_names[i];
	}
	if (charset_subsets[i].hide_assumed == FALSE) {
	    n = cur_assumed++;
	    assumed_doc_charset_map[n] = i;
	    assumed_charset_choices[n] = LYCharSet_UC[i].MIMEname;
	    charset_subsets[i].assumed_idx = n;
	}
    }

    /*
     * Terminate both lists once, after the loop.  This also covers the case
     * where no charset is registered and the loop body never runs.
     */
    display_charset_choices[cur_display] = NULL;
    assumed_charset_choices[cur_assumed] = NULL;
#endif
}
#endif /* USE_CHARSET_CHOICE */