1 /*
2  * Copyright (c) 1997-2005  Kazushi (Jam) Marukawa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice in the documentation and/or other materials provided with
12  *    the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
23  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
24  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 
28 /*
29  * The design of data structure of jless
30  *
31  * We use char[] byte data and CHARSET[] character set data to represent
32  * multilingual text.  We defined CHARSET following ISO 2022 technique.
33  * All characters represented in ISO 2022 can be stored in less without
34  * any destructive conversion.
35  *
36  * For example, less can read text files using JIS C 6226-1978, JIS X
37  * 0208-1983, and JIS X 0208:1990 character sets and output everything
38  * using their original character set while searching a character encoded
39  * by JIS X 0213:2004.  Inside of less, it buffers all text files using
40  * their original character set, unifies them when matching with the
41  * searching character, and outputs using their original character sets.
42  *
43  * If less needs conversions when it outputs internal data, it converts
44  * them on the fly.
45  *
46  * On the other hand, text using SJIS or UJIS are buffered after
47  * conversion while less is reading input stream.
48  *
 * In addition, UTF-8 is buffered as UTF-8.  Less converts it to appropriate
 * character set/sets on the fly. (UTF-8 is not implemented yet).
51  */
52 
/*
 * CHARSET names the character set that one buffered byte belongs to.
 * This section defines the type, the bit fields packed into it, and a
 * few well-known character sets and types of set.
 */
typedef unsigned short CHARSET;

/*
 * Bit layout of a CHARSET value:
 *
 *   151413121110 9 8 7 6 5 4 3 2 1 0
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *   |r|    IRR    |m|n|      F      |
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * r:   Set when the byte is NOT the first byte of a multi-byte character.
 * IRR: Identification of Revisions of Registered character sets.  Read
 *      ISO-2022 for more details.  The stored value ranges from 00/01 to
 *      03/15; 00/00 means "no IRR".  A stored IRR of 00/01..03/15
 *      corresponds to a code from 04/00 to 07/14 in ISO-2022.
 * m:   Set when the byte is part of a multi-byte character.
 * n:   Set for a 96 or 96x96 graphic set, clear for a 94 or 94x94
 *      graphic set.
 * F:   Final byte.  This selects the graphic set of characters.  The
 *      stored value ranges from 00/00 to 04/14 and corresponds to a code
 *      from 03/00 to 07/14 in ISO-2022.
 */

/* r: head byte / rest byte of a character. */
#define REST_MASK		0x8000
#define CSISREST(cs)		((cs) & REST_MASK)
#define CSISHEAD(cs)		(((cs) & REST_MASK) == 0)

/* IRR: stored in bits 9..14. */
#define IRR_SHIFT		9
#define IRR_MASK		0x7e00
#define IRR2CS(irr)		(((irr) << IRR_SHIFT) & IRR_MASK)
#define CS2IRR(cs)		(((cs) & IRR_MASK) >> IRR_SHIFT)

/* Conversion between a stored IRR (1..) and its ISO 2022 code (0x40..). */
#define CODE_DIFF		0x0040
#define CODE_MASK		0x003f
#define CODE2IRR(code)		((((code) - CODE_DIFF) & CODE_MASK) + 1)
#define IRR2CODE(irr)		((((irr) - 1) & CODE_MASK) + CODE_DIFF)

/* m and n: the four types of graphic set. */
#define TYPE_MASK		0x0180
#define TYPE_94_CHARSET		0x0000
#define TYPE_96_CHARSET		0x0080
#define TYPE_94N_CHARSET	0x0100
#define TYPE_96N_CHARSET	0x0180
#define TYPE2CS(type)		((type) & TYPE_MASK)
#define CS2TYPE(cs)		((cs) & TYPE_MASK)

/* F: the final byte is stored relative to 03/00. */
#define FT_DIFF			0x0030
#define FT_MASK			0x007f
#define FT2CS(ft)		(((ft) - FT_DIFF) & FT_MASK)
#define CS2FT(cs)		(((cs) & FT_MASK) + FT_DIFF)

/*
 * A character set is identified by IRR, TYPE and FT together; the r bit
 * only describes the position of a byte inside a character.
 */
#define CHARSET_MASK		(FT_MASK | TYPE_MASK | IRR_MASK)
#define CS2CHARSET(cs)		((cs) & CHARSET_MASK)

/*
 * Every type of charset has a reserved empty set at final byte 07/14
 * ('~'), so a wrong charset is recognised by its F field alone and not
 * by (CS2CHARSET(cs) == WRONGCS).
 */
#define CSISWRONG(cs)		(((cs) & FT_MASK) == FT2CS('~'))

/*
 * List of representative character sets.
 *
 * The WRONG* values all use the reserved final byte '~' and differ only
 * in IRR and type.  NOTE(review): their individual roles are defined by
 * their users in multi.c -- confirm there.
 */
#define ASCII			(TYPE_94_CHARSET | FT2CS('B'))
#define WRONGCS			(TYPE_94_CHARSET | FT2CS('~'))
#define WRONG_ESC		(IRR2CS(1) | TYPE_94_CHARSET | FT2CS('~'))
#define WRONGUCS_H		(IRR2CS(2) | TYPE_94N_CHARSET | FT2CS('~'))
#define WRONGUCS_T		(IRR2CS(3) | TYPE_94N_CHARSET | FT2CS('~'))
#define WRONGUCS_M		(IRR2CS(4) | TYPE_94N_CHARSET | FT2CS('~'))
128 #if ISO
/* 94-character single-byte sets. */
#define JISX0201KANA		(TYPE_94_CHARSET | FT2CS('I'))
#define JISX0201ROMAN		(TYPE_94_CHARSET | FT2CS('J'))

/*
 * 96-character single-byte sets.  Each one is identified only by its
 * final byte; the macro names are the conventional names of the sets.
 */
#define LATIN1			(TYPE_96_CHARSET | FT2CS('A'))
#define LATIN2			(TYPE_96_CHARSET | FT2CS('B'))
#define LATIN3			(TYPE_96_CHARSET | FT2CS('C'))
#define LATIN4			(TYPE_96_CHARSET | FT2CS('D'))
#define CYRILLIC		(TYPE_96_CHARSET | FT2CS('L'))
#define ARABIC			(TYPE_96_CHARSET | FT2CS('G'))
#define GREEK			(TYPE_96_CHARSET | FT2CS('F'))
#define HEBREW			(TYPE_96_CHARSET | FT2CS('H'))
#define LATIN5			(TYPE_96_CHARSET | FT2CS('M'))
#define LATIN6			(TYPE_96_CHARSET | FT2CS('V'))
#define THAI			(TYPE_96_CHARSET | FT2CS('T'))
#define LATIN7			(TYPE_96_CHARSET | FT2CS('Y'))
#define LATIN8			(TYPE_96_CHARSET | FT2CS('_'))
#define LATIN9			(TYPE_96_CHARSET | FT2CS('b'))
#define LATIN10			(TYPE_96_CHARSET | FT2CS('f'))
146 /*
147  * JISX0208_78KANJI means JIS C 6226-1978
148  * JISX0208KANJI means JIS X 0208-1983 (same as JIS C 6226-1983)
149  *   This is similar to JIS C 6226-1978.  Several characters are moved
150  *   or exchanged in code space.  Conversion table is available in unify.c.
151  * JISX0208_90KANJI means JIS X 0208:1990 (same as JIS X 0208-1990)
152  *   This is super set of JIS X 0208-1983.  Two characters are added from
153  *   JIS X 0208-1983.  In addition, this covers JIS X 0208:1997 too.
154  *   They have the same code space.  The difference between them is
 *   historical description.  JIS X 0208:1997 defines and describes
 *   all characters.
157  * JISX0213KANJI1 means JIS X 0213:2000 plane 1
158  *   This is super set of JIS X 0208:1990 and JIS X 0208:1997.  Several
159  *   characters are added.
160  * JISX02132004KANJI1 means JIS X 0213:2004 plane 1
161  *   This is super set of JIS X 0213:2000.  10 characters are added.
162  *   And, glyph of several characters is modified.
163  *
164  * JISX0212KANJISUP means JIS X 0212:1990 (same as JIS X 0212-1990)
 * JISX0213KANJI2 means JIS X 0213:2000 plane 2
 * JISX02132004KANJI2 means JIS X 0213:2004 plane 2
167  *
168  * JISX0201KANA means JIS X 0201:1976 right plane (same as JIS X 0201-1976
169  * and JIS C 6220-1976 right plane)
170  * JISX0201ROMAN means JIS X 0201:1976 left plane (same as JIS X 0201-1976
171  * and JIS C 6220-1976 left plane)
172  *   These cover JIS X 0201:1997 too.  They have the same code space.
173  *   The difference between them is historical description.
 *   JIS X 0201:1997 defines and describes all characters.
175  */
/* 94x94 multi-byte sets, identified by their final byte. */
#define JISX0208_78KANJI	(TYPE_94N_CHARSET | FT2CS('@'))
#define GB2312			(TYPE_94N_CHARSET | FT2CS('A'))
#define JISX0208KANJI		(TYPE_94N_CHARSET | FT2CS('B'))
#define KSC5601			(TYPE_94N_CHARSET | FT2CS('C'))
#define JISX0212KANJISUP	(TYPE_94N_CHARSET | FT2CS('D'))
#define JISX0213KANJI1		(TYPE_94N_CHARSET | FT2CS('O'))
#define JISX0213KANJI2		(TYPE_94N_CHARSET | FT2CS('P'))
#define JISX02132004KANJI1	(TYPE_94N_CHARSET | FT2CS('Q'))
/*
 * Same value as JISX0213KANJI2: both plane 2 sets use final byte 'P',
 * so the two names cannot be told apart by value.
 */
#define JISX02132004KANJI2	(TYPE_94N_CHARSET | FT2CS('P'))

/* Same final byte as JISX0208KANJI, distinguished by revision (IRR) 1. */
#define JISX0208_90KANJI	(IRR2CS(1) | TYPE_94N_CHARSET | FT2CS('B'))

/*
 * UTF-8 pseudo sets.  They share the F value (FT_MASK - 2) and differ
 * only in the IRR field (0, 1, 2).
 */
#define UTF8Z			(IRR2CS(0) | TYPE_94N_CHARSET | (FT_MASK - 2))
#define UTF8			(IRR2CS(1) | TYPE_94N_CHARSET | (FT_MASK - 2))
#define UTF8W			(IRR2CS(2) | TYPE_94N_CHARSET | (FT_MASK - 2))
190 #if JAPANESE
/*
 * Special numbers for the Japanese code sets.  Only input_set uses the
 * following together with the definitions above.  07/15 and 07/14 are
 * not valid values for F, so they serve as markers for these special
 * character sets: F == FT_MASK marks the SJIS family and
 * F == FT_MASK - 1 the UJIS family.  The IRR field selects the variant.
 *
 * SJIS contains ASCII, JIS X 0201:1976 right plane, and JIS X 0208:1997
 * UJIS contains ASCII, JIS X 0201:1976, and JIS X 0208:1997
 * SJIS2000 contains ASCII, JIS X 0201:1976 right plane, and JIS X 0213:2000
 * UJIS2000 contains ASCII, JIS X 0201:1976, JIS X 0213:2000,
 * and JIS X 0212:1990
 * SJIS2004 contains ASCII, JIS X 0201:1976 right plane, and JIS X 0213:2004
 * UJIS2004 contains ASCII, JIS X 0201:1976, JIS X 0213:2004,
 * and JIS X 0212:1990
 */

/* SJIS family: IRR 0..3. */
#define SJIS			(TYPE_94N_CHARSET | FT_MASK | IRR2CS(0))
#define SJIS2000		(TYPE_94N_CHARSET | FT_MASK | IRR2CS(1))
#define SJIS2004		(TYPE_94N_CHARSET | FT_MASK | IRR2CS(2))
#define CP932			(TYPE_94N_CHARSET | FT_MASK | IRR2CS(3))

/* UJIS family: IRR 0..2. */
#define UJIS			(TYPE_94N_CHARSET | (FT_MASK - 1) | IRR2CS(0))
#define UJIS2000		(TYPE_94N_CHARSET | (FT_MASK - 1) | IRR2CS(1))
#define UJIS2004		(TYPE_94N_CHARSET | (FT_MASK - 1) | IRR2CS(2))

212 
213 
/*
 * Make the SJIS/UJIS character set number for mp.
 *
 * SJIS and UJIS use only a fixed number of plane sets.  Therefore it is
 * impossible to use JIS X 0208:1990 and JIS X 0213:2004 at the same
 * time; SJIS uses only one of them, and which one is declared by
 * MULBUF->io.right.  This macro ORs the matching revision into su:
 * IRR 2 when the CJISX0213_2004 bit is set, else IRR 1 when the
 * CJISX0213_2000 bit is set, else nothing.
 *
 * mp may be evaluated twice, so pass an expression without side effects.
 *
 * NOTE(review): CJISX0213_2000 and CJISX0213_2004 are not defined in
 * this header; they must be provided before the macro is expanded.
 *
 * Usage: sjiscs = MAKESUJISCS(mp, SJIS);
 *        ujiscs = MAKESUJISCS(mp, UJIS);
 */
#define MAKESUJISCS(mp, su)						\
	((su) | (((mp)->io.right & CJISX0213_2004)			\
		 ? IRR2CS(2)						\
		 : (((mp)->io.right & CJISX0213_2000) ? IRR2CS(1) : 0)))
229 #endif
230 #endif
231 
/*
 * Special characters and the character set that goes with them.
 *
 *   string terminator : NULCH with NULLCS
 *   padding character : PADCH with NULLCS
 *   binary data '\0'  : '\0'  with WRONGCS
 *   binary data '\1'  : '\1'  with WRONGCS
 *
 * Real '\0' and '\1' bytes therefore never collide with the terminator
 * or the padding, because they carry a different character set.
 */
#define NULCH			('\0')
#define PADCH			('\1')
#define NULLCS			(ASCII)

/*
 * Convenience tests.  Both ignore the r bit because they compare the
 * CS2CHARSET() part only.
 */
#define CSISASCII(cs)		(ASCII == CS2CHARSET(cs))
#define CSISNULLCS(cs)		(NULLCS == CS2CHARSET(cs))
250 
251 
/*
 * CHARVAL packs a character set and one character byte into a single
 * value: the byte occupies the low 8 * sizeof(char) bits and the
 * character set the bits above them.
 *
 * CV2CH(cv)      - the character byte of cv.
 * CV2CS(cv)      - the character set of cv.
 * MAKECV(ch, cs) - build a CHARVAL from character ch and character set cs.
 *                  ch is reduced to its byte value first, so a negative
 *                  (signed) char cannot sign-extend into, and wipe out,
 *                  the character set bits; and ch is parenthesized, so
 *                  any expression can be passed as the argument.
 */
typedef int CHARVAL;

#define CV2CH(cv)		((cv) & ((1 << 8 * sizeof(char)) - 1))
#define CV2CS(cv)		((cv) >> 8 * sizeof(char))
#define MAKECV(ch, cs)		(((cs) << 8 * sizeof(char)) | CV2CH(ch))
260 
261 
262 /*
263  * Definition of SETCHARSET.
264  *
265  * SETCHARSET represents a set of character sets.  This is used to
266  * specify character sets less accepts.
267  *
 * Although ISO 2022 can accept any character set, the output device
 * cannot represent all of them.  Therefore, we give less the ability to
 * specify the character sets that a user wants to use.
271  *
272  * SCSASCII is a value to specify ASCII character set.
273  * SCSJISX0201_1976..SCSJISX0213_2004 specify Japanese character sets.
 *   All of these character sets are defined in Japan.  However,
 *   Japanese terminal devices can display only a few of them.  So, we
 *   decided to give users the ability to specify the character sets that
 *   their terminal device can display.
278  * SCSOTHERISO is used to allow all other ISO 2022 character sets.
279  *   There are too many character sets in the world.  And the number
280  *   of them is increasing.  Therefore, we also decide to give users
281  *   the ability to try all of them.  ;-)
282  */
typedef int SETCHARSET;

/* Individual character sets: one bit each.  SCSASCII is the empty set. */
#define SCSASCII		0x0000
#define SCSJISX0201_1976	0x0001
#define SCSJISC6226_1978	0x0002
#define SCSJISX0208_1983	0x0004
#define SCSJISX0208_1990	0x0008
#define SCSJISX0212_1990	0x0010
#define SCSJISX0213_2000	0x0020
#define SCSJISX0213_2004	0x0040
#define SCSJISX0213_2ND		0x0080	/* 2nd plane of JIS X 0213:2000 and */
					/* JIS X 0213:2004 */
#define SCSOTHERISO		0x0100
#define SCSUTF8			0x0200
#define SCSCP932EX		0x0400	/* Shift_JIS Extended by IBM/NEC/MS */

/*
 * Predefined combinations.
 *
 * SCSALLJIS     - every JIS bit above (0x0001..0x0080).
 * SCSALLJISTRAD - JIS X 0201:1976, JIS C 6226-1978 and JIS X 0208-1983.
 *                 NOTE(review): this used to be described as "everything
 *                 except JIS X 0213 plane 2 and JIS X 0212", but the value
 *                 also leaves out JIS X 0208:1990 and JIS X 0213 plane 1.
 *                 Confirm which of the two is intended.
 * SCSALLSJIS    - SCSALLJIS without JIS X 0212.
 * SCSCP932      - the JIS X 0201 / JIS C 6226 / JIS X 0208 bits plus the
 *                 CP932 extension.
 */
#define SCSALLJIS	(SCSJISX0201_1976 | SCSJISC6226_1978 | \
			 SCSJISX0208_1983 | SCSJISX0208_1990 | \
			 SCSJISX0213_2000 | SCSJISX0213_2004 | \
			 SCSJISX0213_2ND  | SCSJISX0212_1990)
#define SCSALLJISTRAD	(SCSJISX0201_1976 | SCSJISC6226_1978 | \
			 SCSJISX0208_1983)
#define SCSALLSJIS	(SCSJISX0201_1976 | SCSJISC6226_1978 | \
			 SCSJISX0208_1983 | SCSJISX0208_1990 | \
			 SCSJISX0213_2000 | SCSJISX0213_2004 | \
			 SCSJISX0213_2ND)
#define SCSCP932	(SCSJISX0201_1976 | SCSJISC6226_1978 | \
			 SCSJISX0208_1983 | SCSJISX0208_1990 | \
			 SCSCP932EX)
311 
312 /*
313  * Definition of ENCSET.
314  *
315  * ENCSET represents a set of encoding schemes less accepts.  ENCSET is
316  * used as a triplet like { input, inputr, output }.  "input" represents
317  * a set of encoding schemes for input stream left plane (0x00..0x7f).
318  * "inputr" represents a set of encoding schemes for input stream right
319  * plane (0x80..0xff).  "output" represents an encoding scheme for output
320  * stream.
321  *
322  * ESNONE has to be used exclusively to specify no-data.  This is used
323  *   as only "inputr" to specify no right plane (0x80..0xff) data.
324  * ESNOCONV has to be used exclusively to specify no-conversion.
325  * ESISO7 and ESISO8 specify ISO style encoding techniques.  ESISO7 can
326  *   be used as "input" or "output".  ESISO8 can be used as "inputr" or
327  *   "output".
328  * ESJIS83, ESSJIS, and ESUJIS specify Japanese encoding techniques.
329  *   Note: As input, users can use any combination of these values.
330  *   However, as output, users need to use only one of them.
 *   Note: If ESJIS83 is used as "output", less outputs all KANJI
 *   character sets using only the JIS X 0208-1983 character set (ESC$B)
 *   in the hope that the user's terminal device uses the glyphs of the
 *   JIS X 0213:2004 plane 1 character set as its default glyphs.  It is
 *   hard to update a terminal device to understand JIS X 0213:2004
 *   completely, but it is easy to change the glyphs.
 * ESUTF8 specifies both an encoding technique and a character set.  This
 *   has to be used exclusively as output.
339  */
typedef int ENCSET;

/* Exclusive values. */
#define ESNONE		0x0000	/* no data */
#define ESNOCONV	0x0001	/* no conversion */

/* Encoding schemes: one bit each, so that inputs can be combined. */
#define ESISO7		0x0002
#define ESISO8		0x0004
#define ESJIS83		0x0008
#define ESSJIS		0x0010
#define ESUJIS		0x0020
#define ESUTF8		0x0040
#define ESCP932		0x0080

/*
 * Predefined combinations.  The two differ only in the Shift_JIS
 * flavour: ESSJIS in ESALLJA, ESCP932 in ESALLJACP932.
 */
#define ESALLJA		(ESISO8 | ESUJIS | ESSJIS | ESUTF8)
#define ESALLJACP932	(ESISO8 | ESUJIS | ESCP932 | ESUTF8)
352 
/*
 * J_PRIORITY: priority used to select an encoding scheme (originally
 * either UJIS or SJIS; PUTF8 and PNONE are listed as well).
 *
 * The values are spelled out here; they are the same ones the compiler
 * assigns to an enum without initializers.
 */
typedef enum {
    PUJIS = 0,
    PSJIS = 1,
    PUTF8 = 2,
    PNONE = 3
} J_PRIORITY;
362 
/*
 * Unicode Character Width
 *
 * Levels are numbered 0 (UWIDTH_NONE) to 5 (UWIDTH_ALL).
 * NOTE(review): what each level treats as a wide character is decided
 * by the code that uses this type -- confirm in multi.c.
 *
 * The last enumerator has no trailing comma.  That form is only valid
 * from C99 on, while the rest of this header (K&R-style function
 * declarations) is written for older compilers.
 */
typedef enum {
    UWIDTH_NONE   = 0,
    UWIDTH_NORMAL = 1,
    UWIDTH_CJK    = 2,
    UWIDTH_JA     = 3,
    UWIDTH_ALMOST = 4,
    UWIDTH_ALL    = 5
} UWidth;
374 
375 /*
376  * A structure used as a return value in multi_parse().
377  */
378 typedef struct {
379 	char *cbuf;
380 	CHARSET *csbuf;
381 	int byte;
382 } M_BUFDATA;
383 
384 /*
385  * struct multibuf is internal data structure for multi.c.
386  * Defines it name only.
387  */
388 typedef struct multibuf MULBUF;
389 
390 
391 /*
392  * in multi.c
393  */
394 extern int set_planeset ();
395 extern void init_def_scs_es ();
396 extern void init_def_priority ();
397 extern void init_priority ();
398 extern J_PRIORITY get_priority ();
399 extern void set_priority ();
400 extern void set_utfwidth();
401 extern MULBUF * new_multibuf ();
402 extern void clear_multibuf ();
403 extern void init_multibuf ();
404 extern void multi_start ();
405 extern void multi_parse ();
406 extern void multi_flush ();
407 extern void multi_discard ();
408 extern void set_codesets ();
409 extern char * get_icharset_string ();
410 extern char * outchar();
411 extern char * outbuf();
412 extern int mwidth();
413 extern char * rotate_right_codeset ();
414 extern int strlen_cs();
415 extern int chlen_cs();
416 extern char* strdup_cs();
417 
/*
 * in unify.c
 *
 * Declared without parameter lists; see unify.c for the parameters.
 */
extern void jis78to90();
extern void chconvert_cs();
extern void chunify_cs();
extern int chcmp_cs();
extern int checkKANJI();
extern int chisvalid_cs();
427