1 /*
2  * char-sjis.h
3  *
4  *   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   3. Neither the name of the authors nor the names of its contributors
18  *      may be used to endorse or promote products derived from this
19  *      software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef SCM_CHAR_ENCODING_BODY
35 /*===============================================================
36  * Header part
37  */
38 
/* The name of the native character encoding, as a C string literal.
 * The Scheme procedure gauche-character-encoding returns a symbol
 * with this name.
 */
#define SCM_CHAR_ENCODING_NAME "sjis"
43 
/* Given the first byte of a character, returns the number of bytes
 * that follow it: 0 if the byte constitutes a single-byte character,
 * 1 if it is the first byte of a two-byte character.
 *
 *   0x00-0x80 : 0
 *   0x81-0x9f : 1
 *   0xa0-0xdf : 0
 *   0xe0-0xff : 1
 *
 * The general contract of this macro allows -1 for a byte that cannot
 * begin a character, but this implementation never yields it.
 * BYTE may be evaluated more than once.
 */
#define SCM_CHAR_NFOLLOWS(byte)                                 \
    ((((unsigned char)(byte)) < 0x81) ? 0                       \
     : ((((unsigned char)(byte)) >= 0xa0                        \
         && ((unsigned char)(byte)) < 0xe0) ? 0 : 1))
54 
/* Given wide character CH, returns the number of bytes CH occupies when
 * encoded in a multibyte string: 2 if CH is above 0xff, 1 otherwise.
 */
#define SCM_CHAR_NBYTES(ch) (1 + ((ch) > 0xff))
59 
/* Maximum # of bytes a single multibyte character can occupy */
#define SCM_CHAR_MAX_BYTES     2
62 
/* From a multibyte string pointed to by const char *cp, extract a character
 * and store it in ScmChar ch.  cp is not modified.
 *
 * A byte in 0x81-0x9f or 0xe0-0xff is taken as the first byte of a
 * two-byte character (the same bytes for which SCM_CHAR_NFOLLOWS yields
 * 1), and ch becomes (first << 8) + second.  Any other byte, including
 * 0x80, is stored as is, so that no byte beyond the one counted by
 * SCM_CHAR_NFOLLOWS is ever read.
 *
 * The second byte is not validated; SCM_CHAR_INVALID is never stored.
 * Both cp and ch are evaluated more than once.
 */
#define SCM_CHAR_GET(cp, ch)                                            \
    do {                                                                \
        (ch) = (unsigned char)*(cp);                                    \
        if (((ch) >= 0x81 && (ch) < 0xa0) || (ch) >= 0xe0) {            \
            (ch) = ((ch) << 8) + (unsigned char)*((cp) + 1);            \
        }                                                               \
    } while (0)
76 
/* Convert a character CH to multibyte form and put it to the buffer
 * starting from char *cp.  The caller must make sure the buffer has
 * enough room for the multibyte char.  cp is not modified.
 *
 * A CH above 0xff is written as two bytes, high byte first; otherwise
 * a single byte is written.  Both arguments are fully parenthesized so
 * that arbitrary expressions can be passed, but they are evaluated more
 * than once.
 */
#define SCM_CHAR_PUT(cp, ch)                        \
    do {                                            \
        if ((ch) > 0xff) {                          \
            (cp)[0] = ((ch) >> 8) & 0xff;           \
            (cp)[1] = (ch) & 0xff;                  \
        } else {                                    \
            (cp)[0] = (ch) & 0xff;                  \
        }                                           \
    } while (0)
90 
/* const char *cp points into a multibyte string.  Set const char *result
 * to point to the character immediately preceding the one cp points to.
 * const char *start points to the beginning of the buffer.
 * result is set to NULL if cp is already at start.
 * cp and start are not modified (they are evaluated more than once).
 *
 * In Shift JIS the second byte of a two-byte character may itself fall
 * in the first-byte range, so inspecting cp[-2] alone is ambiguous.
 * We therefore count the run of first-byte-range bytes that ends at
 * cp[-2].  If the run length is odd, its last byte begins a two-byte
 * character ending at cp[-1]; if it is even (including zero), the run
 * is made of complete two-byte characters and cp[-1] stands alone.
 * This assumes [start, cp) holds well-formed characters.
 */
#define SCM_CHAR_BACKWARD(cp, start, result)                                \
    do {                                                                    \
        (result) = (cp);                                                    \
        if ((result) == (start)) {                                          \
            (result) = NULL;                                                \
        } else {                                                            \
            long scm__nlead = 0;                                            \
            while ((result) - (start) >= scm__nlead + 2                     \
                   && SCM_CHAR_NFOLLOWS((result)[-2 - scm__nlead]) == 1) {  \
                scm__nlead++;                                               \
            }                                                               \
            (result) -= (scm__nlead & 1) ? 2 : 1;                           \
        }                                                                   \
    } while (0)
108 
/* C is an ScmChar > 0x80.  Yields true iff C is one of the two-byte
 * whitespace characters recognized for this encoding.
 */
#define SCM_CHAR_EXTRA_WHITESPACE(c)            \
    ((c) == 0x8140      /* zenkaku space */     \
     || (c) == 0x8541)  /* NBSP */
113 
/* Like SCM_CHAR_EXTRA_WHITESPACE, but excludes Zl and Zp.
   See R6RS on the intraline whitespaces.
   In this encoding it is defined as a plain alias of
   SCM_CHAR_EXTRA_WHITESPACE. */
#define SCM_CHAR_EXTRA_WHITESPACE_INTRALINE(c) SCM_CHAR_EXTRA_WHITESPACE(c)
117 
118 #else  /* !SCM_CHAR_ENCODING_BODY */
119 /*==================================================================
120  * This part is included in char.c
121  */
122 
/* Array of character encoding names, recognizable by iconv, that are
   compatible with this native encoding.  The list is terminated by a
   NULL entry. */
static const char *supportedCharacterEncodings[] = {
    "SHIFT_JIS",
    "SHIFT-JIS",
    "SHIFT_JISX0213",
    "SHIFT-JISX0213",
    "SJIS",
    NULL
};
133 
134 /*
135  * Lookup character category.  The tables are in char_attr.c, automatically
136  * generated by gen-unicode.scm.
137  */
Scm__LookupCharCategory(ScmChar ch)138 static inline unsigned char Scm__LookupCharCategory(ScmChar ch)
139 {
140     if (ch == SCM_CHAR_INVALID) return SCM_CHAR_CATEGORY_Cn;
141     if (ch < 0x80) return sjis_general_category_00[ch];
142     else if (ch < 0xa0)   return SCM_CHAR_CATEGORY_Cn;
143     else if (ch < 0xe0)   return sjis_general_category_a0[ch-0xa0];
144     else if (ch < 0x8040) return SCM_CHAR_CATEGORY_Cn;
145     else if (ch < 0x9ffd) {
146         unsigned char b0 = (ch >> 8) - 0x80;
147         unsigned char b1 = (ch & 0xff) - 0x40;
148         if (b0 >= (0xa0 - 0x80)) return SCM_CHAR_CATEGORY_Cn;
149         if (b1 >= (0xfd - 0x40)) return SCM_CHAR_CATEGORY_Cn;
150         SCM_ASSERT(0 <= b0 && b0 < (0xa0 - 0x80));
151         SCM_ASSERT(0 <= b1 && b1 < (0xfd - 0x40));
152         return sjis_general_category_8000[b0 * (0xfd-0x40) + b1];
153     } else if (ch < 0xe000) {
154         return SCM_CHAR_CATEGORY_Cn;
155     } else if (ch < 0xfffd) {
156         unsigned char b0 = (ch >> 8) - 0xe0;
157         unsigned char b1 = (ch & 0xff) - 0x40;
158         if (b0 >= (0x100 - 0xe0)) return SCM_CHAR_CATEGORY_Cn;
159         if (b1 >= (0xfd - 0x40))  return SCM_CHAR_CATEGORY_Cn;
160         return sjis_general_category_e000[b0 * (0xfd-0x40) + b1];
161     } else return SCM_CHAR_CATEGORY_Cn;
162 }
163 
164 /*
165  * Returns true if the character isn't supported in Unicode.
166  */
Scm__CharInUnicodeP(ScmChar ch)167 static int Scm__CharInUnicodeP(ScmChar ch)
168 {
169     if (ch < 0x82f5 || ch > 0x8686) return TRUE;
170     if (ch < 0x8600) {
171         if (ch <= 0x82f9 || (ch >= 0x8397 && ch <= 0x839e)
172             || ch == 0x83f6) return FALSE;
173         else return TRUE;
174     } else {
175         if (ch == 0x8663 || (ch >= 0x8667 && ch <= 0x866e)
176             || ch == 0x8685 || ch == 0x8686) return FALSE;
177         else return TRUE;
178     }
179 }
180 
181 #endif /* !SCM_CHAR_ENCODING_BODY */
182