1 /*
2 * char-sjis.h
3 *
4 * Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifndef SCM_CHAR_ENCODING_BODY
35 /*===============================================================
36 * Header part
37 */
38
/* The name of this native encoding, as a C string.  The Scheme procedure
 * gauche-character-encoding returns a symbol with this name.
 */
#define SCM_CHAR_ENCODING_NAME "sjis"
43
/* Given the first byte of a character, returns the number of bytes that
 * follow it: 0 if the byte makes up a single-byte character by itself,
 * 1 if it is the lead byte of a two-byte character.
 *
 * Lead bytes are 0x81..0x9f and 0xe0..0xff.  This implementation never
 * yields -1; every byte outside those two ranges is reported as 0.
 * BYTE may be a plain (possibly signed) char; it is converted to
 * unsigned char before being compared.
 */
#define SCM_CHAR_NFOLLOWS(byte)                         \
    ((((unsigned char)(byte)) >= 0x81                   \
      && (((unsigned char)(byte)) < 0xa0                \
          || ((unsigned char)(byte)) >= 0xe0)) ? 1 : 0)
54
/* Number of bytes the wide character CH occupies once it is encoded in a
 * multibyte string: 1 for values up to 0xff, 2 for anything larger.
 */
#define SCM_CHAR_NBYTES(ch)  (((ch) <= 0xff) ? 1 : 2)
59
/* Maximum number of bytes a single character occupies in this encoding. */
#define SCM_CHAR_MAX_BYTES 2
62
/* From a multibyte string pointed to by const char *cp, extract one
 * character and store it in ScmChar ch.  cp is evaluated exactly once and
 * is not modified.
 *
 * A first byte in 0x81..0x9f or 0xe0..0xff is a lead byte: the result is
 * (lead << 8) + the byte that follows it.  Any other first byte, 0x80
 * included, is stored as is.  These are the same ranges SCM_CHAR_NFOLLOWS
 * uses, so this macro reads exactly SCM_CHAR_NFOLLOWS(*cp) + 1 bytes and
 * never looks past what that macro announces.
 *
 * The trail byte is not validated, and SCM_CHAR_INVALID is never stored
 * by this implementation.
 */
#define SCM_CHAR_GET(cp, ch)                                              \
    do {                                                                  \
        const unsigned char *scm_get_p_ = (const unsigned char *)(cp);    \
        const unsigned char scm_get_b_ = scm_get_p_[0];                   \
        (ch) = scm_get_b_;                                                \
        if (scm_get_b_ >= 0x81                                            \
            && (scm_get_b_ < 0xa0 || scm_get_b_ >= 0xe0)) {               \
            (ch) = (scm_get_b_ << 8) + scm_get_p_[1];                     \
        }                                                                 \
    } while (0)
76
/* Convert the character CH to its multibyte form and store it in the
 * buffer starting at char *cp.  The caller guarantees the buffer is large
 * enough (SCM_CHAR_NBYTES(ch) bytes).  cp is not modified.
 *
 * Values above 0xff are written as two bytes, high byte first; anything
 * else is written as a single byte.  CH is parenthesized at every use so
 * that an expression argument (e.g. hi | lo) is shifted and masked as a
 * whole; both CP and CH are evaluated more than once, so they should be
 * free of side effects.
 */
#define SCM_CHAR_PUT(cp, ch)                            \
    do {                                                \
        if ((ch) > 0xff) {                              \
            (cp)[0] = ((ch) >> 8) & 0xff;               \
            (cp)[1] = (ch) & 0xff;                      \
        } else {                                        \
            (cp)[0] = (ch) & 0xff;                      \
        }                                               \
    } while (0)
90
/* const char *cp points into a multibyte string whose buffer begins at
 * const char *start.  Set const char *result to the beginning of the
 * character that ends just before cp, or to NULL if cp == start (there is
 * no previous character).  cp and start are not modified.  cp is assumed
 * to sit on a character boundary.
 *
 * A trail byte can have the same value as a lead byte, so looking at
 * cp[-2] alone cannot tell "two-byte character" from "single-byte
 * character preceded by a two-byte character" (e.g. 0x81 0x81 0x41).
 * Instead we walk back over the unbroken run of lead-byte candidates
 * (SCM_CHAR_NFOLLOWS == 1) that ends at cp[-2].  The byte in front of that
 * run ends a character, so the run pairs up starting from its first byte:
 * an odd run length means its last byte is the lead of cp[-1] (step back
 * two bytes); an even length means cp[-1] stands alone (step back one).
 * The walk never goes below start.
 */
#define SCM_CHAR_BACKWARD(cp, start, result)                              \
    do {                                                                  \
        const char *scm_bw_end_ = (const char *)(cp);                     \
        const char *scm_bw_lo_ = (const char *)(start);                   \
        if (scm_bw_end_ == scm_bw_lo_) {                                  \
            (result) = NULL;                                              \
        } else {                                                          \
            const char *scm_bw_run_ = scm_bw_end_ - 1;                    \
            while (scm_bw_run_ > scm_bw_lo_                               \
                   && SCM_CHAR_NFOLLOWS(scm_bw_run_[-1]) == 1) {          \
                scm_bw_run_--;                                            \
            }                                                             \
            (result) = (cp);                                              \
            (result) -= (((scm_bw_end_ - 1 - scm_bw_run_) & 1) ? 2 : 1);  \
        }                                                                 \
    } while (0)
108
/* Whitespace test for characters beyond ASCII: C is an ScmChar > 0x80.
 * Evaluates to true for exactly the two code points listed below, and to
 * false for everything else.
 */
#define SCM_CHAR_EXTRA_WHITESPACE(c)                            \
    ((c) == 0x8140          /* zenkaku (full-width) space */    \
     || (c) == 0x8541)      /* NBSP */
113
/* Like SCM_CHAR_EXTRA_WHITESPACE, but excludes Zl and Zp.
   See R6RS on the intraline whitespaces.
   In this encoding the macro simply expands to SCM_CHAR_EXTRA_WHITESPACE,
   which tests only for the zenkaku space and NBSP, so there is nothing
   to exclude. */
#define SCM_CHAR_EXTRA_WHITESPACE_INTRALINE(c) SCM_CHAR_EXTRA_WHITESPACE(c)
117
118 #else /* !SCM_CHAR_ENCODING_BODY */
119 /*==================================================================
120 * This part is included in char.c
121 */
122
/* Array of character encoding names, recognizable by iconv, that are
   compatible with this native encoding.  The list is terminated by a
   NULL entry. */
static const char *supportedCharacterEncodings[] = {
    "SHIFT_JIS",
    "SHIFT-JIS",
    "SHIFT_JISX0213",
    "SHIFT-JISX0213",
    "SJIS",
    NULL
};
133
134 /*
135 * Lookup character category. The tables are in char_attr.c, automatically
136 * generated by gen-unicode.scm.
137 */
Scm__LookupCharCategory(ScmChar ch)138 static inline unsigned char Scm__LookupCharCategory(ScmChar ch)
139 {
140 if (ch == SCM_CHAR_INVALID) return SCM_CHAR_CATEGORY_Cn;
141 if (ch < 0x80) return sjis_general_category_00[ch];
142 else if (ch < 0xa0) return SCM_CHAR_CATEGORY_Cn;
143 else if (ch < 0xe0) return sjis_general_category_a0[ch-0xa0];
144 else if (ch < 0x8040) return SCM_CHAR_CATEGORY_Cn;
145 else if (ch < 0x9ffd) {
146 unsigned char b0 = (ch >> 8) - 0x80;
147 unsigned char b1 = (ch & 0xff) - 0x40;
148 if (b0 >= (0xa0 - 0x80)) return SCM_CHAR_CATEGORY_Cn;
149 if (b1 >= (0xfd - 0x40)) return SCM_CHAR_CATEGORY_Cn;
150 SCM_ASSERT(0 <= b0 && b0 < (0xa0 - 0x80));
151 SCM_ASSERT(0 <= b1 && b1 < (0xfd - 0x40));
152 return sjis_general_category_8000[b0 * (0xfd-0x40) + b1];
153 } else if (ch < 0xe000) {
154 return SCM_CHAR_CATEGORY_Cn;
155 } else if (ch < 0xfffd) {
156 unsigned char b0 = (ch >> 8) - 0xe0;
157 unsigned char b1 = (ch & 0xff) - 0x40;
158 if (b0 >= (0x100 - 0xe0)) return SCM_CHAR_CATEGORY_Cn;
159 if (b1 >= (0xfd - 0x40)) return SCM_CHAR_CATEGORY_Cn;
160 return sjis_general_category_e000[b0 * (0xfd-0x40) + b1];
161 } else return SCM_CHAR_CATEGORY_Cn;
162 }
163
164 /*
165 * Returns true if the character isn't supported in Unicode.
166 */
Scm__CharInUnicodeP(ScmChar ch)167 static int Scm__CharInUnicodeP(ScmChar ch)
168 {
169 if (ch < 0x82f5 || ch > 0x8686) return TRUE;
170 if (ch < 0x8600) {
171 if (ch <= 0x82f9 || (ch >= 0x8397 && ch <= 0x839e)
172 || ch == 0x83f6) return FALSE;
173 else return TRUE;
174 } else {
175 if (ch == 0x8663 || (ch >= 0x8667 && ch <= 0x866e)
176 || ch == 0x8685 || ch == 0x8686) return FALSE;
177 else return TRUE;
178 }
179 }
180
181 #endif /* !SCM_CHAR_ENCODING_BODY */
182