1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "gui/base/encoding_util.h"
31 
32 #include "base/port.h"
33 #include "base/string_piece.h"
34 #include "base/util.h"
35 
36 namespace mozc {
37 namespace {
38 
39 #include "gui/base/sjis_to_ucs2_table.h"
40 
41 // Each character of SJIS is encoded in one or two bytes.
42 //
43 // For first byte, there are 4 valid ranges (closed intervals):
44 //   * FirstByteRange1: [0x00, 0x80]
45 //   * FirstByteRange2: [0x81, 0x9F]
46 //   * FirstByteRange3: [0xA1, 0xDF]
47 //   * FirstByteRange4: [0xE0, 0xFF]
// A first byte in range 2 or 4 starts a two-byte character, so one more byte
// is needed to decode it.
50 //
51 // For second byte, there are 2 valid ranges (closed intervals):
52 //   * SecondByteRange1: [0x40, 0x7E]
53 //   * SecondByteRange2: [0x80, 0xFF]
54 // Two byte characters are decoded using the conversion table defined in
55 // sjis_to_ucs2_table.h.
// Returns true if |byte| belongs to FirstByteRange1, i.e. [0x00, 0x80].
// The lower bound needs no check because |byte| is unsigned.
inline bool IsInFirstByteRange1(uint8_t byte) {
  const uint8_t kLast = 0x80;
  return !(byte > kLast);
}
59 
// Returns true if |byte| belongs to FirstByteRange2, i.e. [0x81, 0x9F].
inline bool IsInFirstByteRange2(uint8_t byte) {
  const uint8_t kFirst = 0x81;
  const uint8_t kLast = 0x9F;
  return !(byte < kFirst || byte > kLast);
}
63 
// Returns true if |byte| belongs to FirstByteRange3, i.e. [0xA1, 0xDF].
inline bool IsInFirstByteRange3(uint8_t byte) {
  if (byte < 0xA1) {
    return false;
  }
  return byte <= 0xDF;
}
67 
// Returns true if |byte| belongs to FirstByteRange4, i.e. [0xE0, 0xFF].
// The upper bound is implied by the width of uint8_t.
inline bool IsInFirstByteRange4(uint8_t byte) {
  const uint8_t kFirst = 0xE0;
  return byte >= kFirst;
}
71 
// Returns true if |byte| belongs to SecondByteRange1, i.e. [0x40, 0x7E].
inline bool IsInSecondByteRange1(uint8_t byte) {
  const uint8_t kFirst = 0x40;
  const uint8_t kLast = 0x7E;
  return byte >= kFirst && !(byte > kLast);
}
75 
// Returns true if |byte| belongs to SecondByteRange2, i.e. [0x80, 0xFF].
// The upper bound is implied by the width of uint8_t.
inline bool IsInSecondByteRange2(uint8_t byte) {
  return !(byte < 0x80);
}
79 
ComputeIndex(uint8_t first,uint8_t second)80 size_t ComputeIndex(uint8_t first, uint8_t second) {
81   size_t first_index = 0;
82   if (IsInFirstByteRange2(first)) {
83     // first_index = "offset of first in FirstByteRange2".
84     first_index = first - 0x81;
85   } else if (IsInFirstByteRange4(first)) {
86     // first_index = "offset of first in FirstByteRange4" +
87     //               length(FirstByteRange2)
88     first_index = (first - 0xE0) + (0x9F - 0x81 + 1);
89   }
90 
91   size_t second_index = 0;
92   if (IsInSecondByteRange1(second)) {
93     // second_index = "offset of second in SecondByteRange1";
94     second_index = second - 0x40;
95   } else if (IsInSecondByteRange2(second)) {
96     // second_index = "offset of second in SecondByteRange2" +
97     //                length(SecondByteRange1)
98     second_index = (second - 0x80) + (0x7E - 0x40 + 1);
99   }
100 
101   // width = length(SecondByteRange1) + length(SecondByteRange2)
102   const size_t width = (0x7E - 0x40 + 1) + (0xFF - 0x80 + 1);
103   return first_index * width + second_index;
104 }
105 
SJISToUTF8Internal(StringPiece input,string * output)106 bool SJISToUTF8Internal(StringPiece input, string* output) {
107   bool expect_first_byte = true;
108   uint8_t first_byte = 0;
109   for (const char c : input) {
110     const uint8_t byte = static_cast<uint8_t>(c);
111 
112     if (expect_first_byte) {
113       if (IsInFirstByteRange1(byte)) {
114         Util::UCS4ToUTF8Append(byte, output);
115       } else if (IsInFirstByteRange3(byte)) {
116         Util::UCS4ToUTF8Append(byte + 0xFEC0, output);
117       } else if (IsInFirstByteRange2(byte) || IsInFirstByteRange4(byte)) {
118         first_byte = byte;
119         expect_first_byte = false;
120       } else {
121         return false;  // Invalid first byte.
122       }
123       continue;
124     }
125 
126     if (!IsInSecondByteRange1(byte) && !IsInSecondByteRange2(byte)) {
127       return false;
128     }
129     const size_t index = ComputeIndex(first_byte, byte);
130     if (index >= sizeof(kSJISToUCS2Table)) {
131       return false;
132     }
133     const uint16_t ucs2 = kSJISToUCS2Table[index];
134     if (ucs2 == 0) {
135       return false;
136     }
137     Util::UCS4ToUTF8Append(ucs2, output);
138     expect_first_byte = true;
139   }
140   return expect_first_byte;
141 }
142 
143 }   // namespace
144 
SJISToUTF8(const string & input,string * output)145 void EncodingUtil::SJISToUTF8(const string &input, string *output) {
146   output->clear();
147   if (!SJISToUTF8Internal(input, output)) {
148     output->clear();
149   }
150 }
151 
152 }  // namespace mozc
153