1 /* 2 * Copyright (c) 1997-2005 Kazushi (Jam) Marukawa 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice in the documentation and/or other materials provided with 12 * the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT 20 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 21 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 23 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 24 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 28 /* 29 * The design of data structure of jless 30 * 31 * We use char[] byte data and CHARSET[] character set data to represent 32 * multilingual text. We defined CHARSET following ISO 2022 technique. 33 * All characters represented in ISO 2022 can be stored in less without 34 * any destructive conversion. 35 * 36 * For example, less can read text files using JIS C 6226-1978, JIS X 37 * 0208-1983, and JIS X 0208:1990 character sets and output everything 38 * using their original character set while searching a character encoded 39 * by JIS X 0213:2004. Inside of less, it buffers all text files using 40 * their original character set, unifies them when matching with the 41 * searching character, and outputs using their original character sets. 42 * 43 * If less needs conversions when it outputs internal data, it converts 44 * them on the fly. 45 * 46 * On the other hand, text using SJIS or UJIS are buffered after 47 * conversion while less is reading input stream. 48 * 49 * In addition, UTF-8 is buffered as UTF-8. Less converts it to appropriate 50 * character set/sets on the fly. (UTF-8 is notimplemented yet). 51 */ 52 53 /* 54 * Definition of values to specify the character set. 55 * And definitions some well known character sets and a types of set. 56 */ 57 typedef unsigned short CHARSET; 58 59 /* 60 * The structure of CHARSET: 61 * 62 * 151413121110 9 8 7 6 5 4 3 2 1 0 63 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 64 * |r| IRR |m|n| F | 65 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 66 * 67 * r: True if it is not the first byte of multi-byte characters. 68 * IRR: Identification of Revisions of Registered character sets (IRR). 69 * Read ISO-2022 for mode details. The value of IRR is ranged from 70 * 00/01 to 03/15. 00/00 means no IRR. IRR (from 00/01 to 03/15) 71 * is mapped to a code from 04/00 to 07/14 in ISO-2022. 72 * m: True if it is part of multi-byte characters. 73 * n: True if it is one of 96 or 96x96 graphic sets, otherwise it is one 74 * of 94 or 94x94 graphic sets. 75 * F: Final byte (F). This select graphi sets of characters. 76 * The value of F is ranged from 00/00 to 04/14. Such values are coded 77 * from 03/00 to 07/14 in ISO-2022. 78 */ 79 80 #define REST_MASK 0x8000 /* r */ 81 #define CSISHEAD(cs) (!((cs) & REST_MASK)) 82 #define CSISREST(cs) ((cs) & REST_MASK) 83 84 #define IRR_MASK 0x7e00 /* IRR */ 85 #define IRR_SHIFT 9 86 #define CS2IRR(cs) (((cs) & IRR_MASK) >> IRR_SHIFT) 87 #define IRR2CS(irr) (((irr) << IRR_SHIFT) & IRR_MASK) 88 89 #define CODE_MASK 0x003f /* coded IRR in ISO 2022 */ 90 #define CODE_DIFF 0x0040 91 #define IRR2CODE(irr) ((((irr) - 1) & CODE_MASK) + CODE_DIFF) 92 #define CODE2IRR(code) ((((code) - CODE_DIFF) & CODE_MASK) + 1) 93 94 #define TYPE_94_CHARSET 0x0000 /* m & n */ 95 #define TYPE_96_CHARSET 0x0080 96 #define TYPE_94N_CHARSET 0x0100 97 #define TYPE_96N_CHARSET 0x0180 98 #define TYPE_MASK 0x0180 99 #define CS2TYPE(cs) ((cs) & TYPE_MASK) 100 #define TYPE2CS(type) ((type) & TYPE_MASK) 101 102 #define FT_MASK 0x007f /* F */ 103 #define FT_DIFF 0x0030 104 #define CS2FT(cs) (((cs) & FT_MASK) + FT_DIFF) 105 #define FT2CS(ft) (((ft) - FT_DIFF) & FT_MASK) 106 107 /* 108 * Each character sets is represented by IRR, TYPE and FT. 109 */ 110 #define CHARSET_MASK (IRR_MASK | TYPE_MASK | FT_MASK) 111 #define CS2CHARSET(cs) ((cs) & CHARSET_MASK) 112 113 /* 114 * There is a reserved empty set in every type of charset. 07/14. 115 * So we cannot use (CS2CHARSET(cs) == WRONGCS) to check it. 116 */ 117 #define CSISWRONG(cs) (CS2FT(cs) == '~') 118 119 /* 120 * List of representative character sets. 121 */ 122 #define ASCII (TYPE_94_CHARSET | FT2CS('B')) 123 #define WRONGCS (TYPE_94_CHARSET | FT2CS('~')) 124 #define WRONG_ESC (IRR2CS(1) | TYPE_94_CHARSET | FT2CS('~')) 125 #define WRONGUCS_H (IRR2CS(2) | TYPE_94N_CHARSET | FT2CS('~')) 126 #define WRONGUCS_T (IRR2CS(3) | TYPE_94N_CHARSET | FT2CS('~')) 127 #define WRONGUCS_M (IRR2CS(4) | TYPE_94N_CHARSET | FT2CS('~')) 128 #if ISO 129 #define JISX0201KANA (TYPE_94_CHARSET | FT2CS('I')) 130 #define JISX0201ROMAN (TYPE_94_CHARSET | FT2CS('J')) 131 #define LATIN1 (TYPE_96_CHARSET | FT2CS('A')) 132 #define LATIN2 (TYPE_96_CHARSET | FT2CS('B')) 133 #define LATIN3 (TYPE_96_CHARSET | FT2CS('C')) 134 #define LATIN4 (TYPE_96_CHARSET | FT2CS('D')) 135 #define CYRILLIC (TYPE_96_CHARSET | FT2CS('L')) 136 #define ARABIC (TYPE_96_CHARSET | FT2CS('G')) 137 #define GREEK (TYPE_96_CHARSET | FT2CS('F')) 138 #define HEBREW (TYPE_96_CHARSET | FT2CS('H')) 139 #define LATIN5 (TYPE_96_CHARSET | FT2CS('M')) 140 #define LATIN6 (TYPE_96_CHARSET | FT2CS('V')) 141 #define THAI (TYPE_96_CHARSET | FT2CS('T')) 142 #define LATIN7 (TYPE_96_CHARSET | FT2CS('Y')) 143 #define LATIN8 (TYPE_96_CHARSET | FT2CS('_')) 144 #define LATIN9 (TYPE_96_CHARSET | FT2CS('b')) 145 #define LATIN10 (TYPE_96_CHARSET | FT2CS('f')) 146 /* 147 * JISX0208_78KANJI means JIS C 6226-1978 148 * JISX0208KANJI means JIS X 0208-1983 (same as JIS C 6226-1983) 149 * This is similar to JIS C 6226-1978. Several characters are moved 150 * or exchanged in code space. Conversion table is available in unify.c. 151 * JISX0208_90KANJI means JIS X 0208:1990 (same as JIS X 0208-1990) 152 * This is super set of JIS X 0208-1983. Two characters are added from 153 * JIS X 0208-1983. In addition, this covers JIS X 0208:1997 too. 154 * They have the same code space. The difference between them is 155 * historical description. JIS X 0208:1997 defines ans describes 156 * all characters. 157 * JISX0213KANJI1 means JIS X 0213:2000 plane 1 158 * This is super set of JIS X 0208:1990 and JIS X 0208:1997. Several 159 * characters are added. 160 * JISX02132004KANJI1 means JIS X 0213:2004 plane 1 161 * This is super set of JIS X 0213:2000. 10 characters are added. 162 * And, glyph of several characters is modified. 163 * 164 * JISX0212KANJISUP means JIS X 0212:1990 (same as JIS X 0212-1990) 165 * JISX0213KANJI2 means JIS X 0213:2000 plane 1 166 * JISX02132004KANJI2 means JIS X 0213:2004 plane 1 167 * 168 * JISX0201KANA means JIS X 0201:1976 right plane (same as JIS X 0201-1976 169 * and JIS C 6220-1976 right plane) 170 * JISX0201ROMAN means JIS X 0201:1976 left plane (same as JIS X 0201-1976 171 * and JIS C 6220-1976 left plane) 172 * These cover JIS X 0201:1997 too. They have the same code space. 173 * The difference between them is historical description. 174 * JIS X 0201:1997 defines ans describes all characters. 175 */ 176 #define JISX0208_78KANJI (TYPE_94N_CHARSET | FT2CS('@')) 177 #define GB2312 (TYPE_94N_CHARSET | FT2CS('A')) 178 #define JISX0208KANJI (TYPE_94N_CHARSET | FT2CS('B')) 179 #define JISX0208_90KANJI (IRR2CS(1) | TYPE_94N_CHARSET | FT2CS('B')) 180 #define KSC5601 (TYPE_94N_CHARSET | FT2CS('C')) 181 #define JISX0212KANJISUP (TYPE_94N_CHARSET | FT2CS('D')) 182 #define JISX0213KANJI1 (TYPE_94N_CHARSET | FT2CS('O')) 183 #define JISX0213KANJI2 (TYPE_94N_CHARSET | FT2CS('P')) 184 #define JISX02132004KANJI1 (TYPE_94N_CHARSET | FT2CS('Q')) 185 #define JISX02132004KANJI2 (TYPE_94N_CHARSET | FT2CS('P')) 186 187 #define UTF8Z (IRR2CS(0) | TYPE_94N_CHARSET | (FT_MASK-2)) 188 #define UTF8 (IRR2CS(1) | TYPE_94N_CHARSET | (FT_MASK-2)) 189 #define UTF8W (IRR2CS(2) | TYPE_94N_CHARSET | (FT_MASK-2)) 190 #if JAPANESE 191 /* 192 * Special number for Japanese code set. Only input_set use following with 193 * above definitions. The 07/15 or 07/14 are not valid for F. So, we are 194 * using them as indications of special character sets. 195 * 196 * SJIS contains ASCII, JIS X 0201:1976 right plane, and JIS X 0208:1997 197 * UJIS contains ASCII, JIS X 0201:1976, and JIS X 0208:1997 198 * SJIS2000 contains ASCII, JIS X 0201:1976 right plane, and JIS X 0213:2000 199 * UJIS2000 contains ASCII, JIS X 0201:1976, JIS X 0213:2000, 200 * and JIS X 0212:1990 201 * SJIS2004 contains ASCII, JIS X 0201:1976 right plane, and JIS X 0213:2004 202 * UJIS2004 contains ASCII, JIS X 0201:1976, JIS X 0213:2004, 203 * and JIS X 0212:1990 204 */ 205 #define SJIS (IRR2CS(0) | TYPE_94N_CHARSET | FT_MASK) 206 #define SJIS2000 (IRR2CS(1) | TYPE_94N_CHARSET | FT_MASK) 207 #define SJIS2004 (IRR2CS(2) | TYPE_94N_CHARSET | FT_MASK) 208 #define CP932 (IRR2CS(3) | TYPE_94N_CHARSET | FT_MASK) 209 #define UJIS (IRR2CS(0) | TYPE_94N_CHARSET | (FT_MASK-1)) 210 #define UJIS2000 (IRR2CS(1) | TYPE_94N_CHARSET | (FT_MASK-1)) 211 #define UJIS2004 (IRR2CS(2) | TYPE_94N_CHARSET | (FT_MASK-1)) 212 213 214 /* 215 * Make SJIS/UJIS character set from mp. 216 * 217 * SJIS and UJIS are using only fixed number of plane sets. Therefore, 218 * it is impossible to use JIS X 0208:1990 and JIS X 0213:2004 at the 219 * same time. SJIS use only one of them. And, it is declared by 220 * MULBUF->io.right. This function constructs appropriate SJIS 221 * character set number from it. 222 * 223 * Usage: sjiscs = MAKESUJISCS(mp, SJIS); 224 * ujiscs = MAKESUJISCS(mp, UJIS); 225 */ 226 #define MAKESUJISCS(mp,su) \ 227 ((su)| (((mp)->io.right&CJISX0213_2004)?IRR2CS(2):\ 228 (((mp)->io.right&CJISX0213_2000)?IRR2CS(1):0))) 229 #endif 230 #endif 231 232 /* 233 * List of special characters and character set for it. 234 * 235 * A terminator of string with character set is represented by 236 * both a NULCH and a NULLCS. A padding character in string with 237 * character set is represented by both a PADCH and a NULLCS. A 238 * binary data '\0' and '\1' are represented by both '\0' and a 239 * WRONGCS, and both '\1' and a WRONGCS respectively. 240 */ 241 #define NULCH ('\0') 242 #define PADCH ('\1') 243 #define NULLCS (ASCII) 244 245 /* 246 * Macros for easy checking. 247 */ 248 #define CSISASCII(cs) (CS2CHARSET(cs) == ASCII) 249 #define CSISNULLCS(cs) (CS2CHARSET(cs) == NULLCS) 250 251 252 /* 253 * Definition of values to specify the character set and character. 254 */ 255 typedef int CHARVAL; 256 257 #define MAKECV(ch, cs) (((cs) << 8 * sizeof(char)) | ch) 258 #define CV2CH(cv) ((cv) & ((1 << 8 * sizeof(char)) - 1)) 259 #define CV2CS(cv) ((cv) >> 8 * sizeof(char)) 260 261 262 /* 263 * Definition of SETCHARSET. 264 * 265 * SETCHARSET represents a set of character sets. This is used to 266 * specify character sets less accepts. 267 * 268 * Although, ISO 2022 can accept any character sets, the output device 269 * cannot represents all. Therefore, we add less ability to specify 270 * character sets that a user want to use. 271 * 272 * SCSASCII is a value to specify ASCII character set. 273 * SCSJISX0201_1976..SCSJISX0213_2004 specify Japanese character sets. 274 * All of these are character sets are defined in Japan. However, 275 * Japanese terminal devices can display only few of them. So, we 276 * decide to give users the ability to specify character sets that 277 * their terminal device can display. 278 * SCSOTHERISO is used to allow all other ISO 2022 character sets. 279 * There are too many character sets in the world. And the number 280 * of them is increasing. Therefore, we also decide to give users 281 * the ability to try all of them. ;-) 282 */ 283 typedef int SETCHARSET; 284 #define SCSASCII 0x0000 285 #define SCSJISX0201_1976 0x0001 286 #define SCSJISC6226_1978 0x0002 287 #define SCSJISX0208_1983 0x0004 288 #define SCSJISX0208_1990 0x0008 289 #define SCSJISX0212_1990 0x0010 290 #define SCSJISX0213_2000 0x0020 291 #define SCSJISX0213_2004 0x0040 292 #define SCSJISX0213_2ND 0x0080 /* 2nd plane of JIS X 0213:2000 and */ 293 /* JIS X 0213:2004 */ 294 #define SCSOTHERISO 0x0100 295 #define SCSUTF8 0x0200 296 #define SCSCP932EX 0x0400 /* Shift_JIS Extended by IBM/NEC/MS */ 297 /* 298 * SCSALLJIS - everything 299 * SCSALLJISTRAD - everything except JIS X 0213 plane 2 and JIS X 0212. 300 * SCSALLSJIS - everything except JIS X 0212 301 */ 302 #define SCSALLJIS (SCSJISX0201_1976|SCSJISC6226_1978|SCSJISX0208_1983|\ 303 SCSJISX0208_1990|SCSJISX0213_2000|SCSJISX0213_2004|\ 304 SCSJISX0213_2ND|SCSJISX0212_1990) 305 #define SCSALLJISTRAD (SCSJISX0201_1976|SCSJISC6226_1978|SCSJISX0208_1983) 306 #define SCSALLSJIS (SCSJISX0201_1976|SCSJISC6226_1978|SCSJISX0208_1983|\ 307 SCSJISX0208_1990|SCSJISX0213_2000|SCSJISX0213_2004|\ 308 SCSJISX0213_2ND) 309 #define SCSCP932 (SCSJISX0201_1976|SCSJISC6226_1978|SCSJISX0208_1983|\ 310 SCSJISX0208_1990|SCSCP932EX) 311 312 /* 313 * Definition of ENCSET. 314 * 315 * ENCSET represents a set of encoding schemes less accepts. ENCSET is 316 * used as a triplet like { input, inputr, output }. "input" represents 317 * a set of encoding schemes for input stream left plane (0x00..0x7f). 318 * "inputr" represents a set of encoding schemes for input stream right 319 * plane (0x80..0xff). "output" represents an encoding scheme for output 320 * stream. 321 * 322 * ESNONE has to be used exclusively to specify no-data. This is used 323 * as only "inputr" to specify no right plane (0x80..0xff) data. 324 * ESNOCONV has to be used exclusively to specify no-conversion. 325 * ESISO7 and ESISO8 specify ISO style encoding techniques. ESISO7 can 326 * be used as "input" or "output". ESISO8 can be used as "inputr" or 327 * "output". 328 * ESJIS83, ESSJIS, and ESUJIS specify Japanese encoding techniques. 329 * Note: As input, users can use any combination of these values. 330 * However, as output, users need to use only one of them. 331 * Note: If ESJIS83 is used as "output", less output all KANJI 332 * character set using only JIS X 0208-1983 character set (ESC$B) with 333 * a hope that user's terminal device is using glyph of JIS X 0213:2004 334 * plane 1 character set as its default glyph. It is hard to update 335 * terminal device to understand JIS X 0213:2004 completely, but it is 336 * easy to change the glyph. 337 * ESUTF8 specifies encoding technique and character set. This have to 338 * be used exclusively as output. 339 */ 340 typedef int ENCSET; 341 #define ESNONE 0x0000 342 #define ESNOCONV 0x0001 343 #define ESISO7 0x0002 344 #define ESISO8 0x0004 345 #define ESJIS83 0x0008 346 #define ESSJIS 0x0010 347 #define ESUJIS 0x0020 348 #define ESUTF8 0x0040 349 #define ESCP932 0x0080 350 #define ESALLJA (ESISO8|ESUTF8|ESUJIS|ESSJIS) 351 #define ESALLJACP932 (ESISO8|ESUTF8|ESUJIS|ESCP932) 352 353 /* 354 * J_PRIORITY: priority to select either UJIS or SJIS as encoding scheme. 355 */ 356 typedef enum { 357 PUJIS, 358 PSJIS, 359 PUTF8, 360 PNONE 361 } J_PRIORITY; 362 363 /* 364 * Unicode Character Width 365 */ 366 typedef enum { 367 UWIDTH_NONE = 0, 368 UWIDTH_NORMAL = 1, 369 UWIDTH_CJK = 2, 370 UWIDTH_JA = 3, 371 UWIDTH_ALMOST = 4, 372 UWIDTH_ALL = 5, 373 } UWidth; 374 375 /* 376 * A structure used as a return value in multi_parse(). 377 */ 378 typedef struct { 379 char *cbuf; 380 CHARSET *csbuf; 381 int byte; 382 } M_BUFDATA; 383 384 /* 385 * struct multibuf is internal data structure for multi.c. 386 * Defines it name only. 387 */ 388 typedef struct multibuf MULBUF; 389 390 391 /* 392 * in multi.c 393 */ 394 extern int set_planeset (); 395 extern void init_def_scs_es (); 396 extern void init_def_priority (); 397 extern void init_priority (); 398 extern J_PRIORITY get_priority (); 399 extern void set_priority (); 400 extern void set_utfwidth(); 401 extern MULBUF * new_multibuf (); 402 extern void clear_multibuf (); 403 extern void init_multibuf (); 404 extern void multi_start (); 405 extern void multi_parse (); 406 extern void multi_flush (); 407 extern void multi_discard (); 408 extern void set_codesets (); 409 extern char * get_icharset_string (); 410 extern char * outchar(); 411 extern char * outbuf(); 412 extern int mwidth(); 413 extern char * rotate_right_codeset (); 414 extern int strlen_cs(); 415 extern int chlen_cs(); 416 extern char* strdup_cs(); 417 418 /* 419 * in unify.c 420 */ 421 extern void jis78to90(); 422 extern void chconvert_cs(); 423 extern void chunify_cs(); 424 extern int chcmp_cs(); 425 extern int checkKANJI(); 426 extern int chisvalid_cs(); 427