1 /*
2 * Copyright 2016 Nu-book Inc.
3 * Copyright 2016 ZXing authors
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 #include "CharacterSetECI.h"
19 #include "TextDecoder.h"
20
21 #include <cctype>
22 #include <map>
23 #include <utility>
24 #include <algorithm>
25
26 namespace ZXing::CharacterSetECI {
27
// Lookup table from numeric ECI values to the CharacterSet they designate.
// Several ECI values map to the same CharacterSet (0 and 2 -> Cp437, 1 and 3 -> ISO8859_1,
// 27 and 170 -> ASCII). Values that have no entry here (for example 14 and 19) are reported
// as CharacterSet::Unknown by CharsetFromValue().
// Being a std::map, the table is iterated in ascending key order, which is what the reverse
// lookup in ValueForCharset() relies on when more than one key maps to the same CharacterSet.
static const std::map<int, CharacterSet> ECI_VALUE_TO_CHARSET = {
    {0, CharacterSet::Cp437}, // Obsolete
    {1, CharacterSet::ISO8859_1}, // Obsolete
    {2, CharacterSet::Cp437}, // Obsolete but still used by PDF417 Macro fields (ISO/IEC 15438:2015 Annex H.2.3)
    {3, CharacterSet::ISO8859_1},
    {4, CharacterSet::ISO8859_2},
    {5, CharacterSet::ISO8859_3},
    {6, CharacterSet::ISO8859_4},
    {7, CharacterSet::ISO8859_5},
    {8, CharacterSet::ISO8859_6},
    {9, CharacterSet::ISO8859_7},
    {10, CharacterSet::ISO8859_8},
    {11, CharacterSet::ISO8859_9},
    {12, CharacterSet::ISO8859_10},
    {13, CharacterSet::ISO8859_11},
    // no entry for 14
    {15, CharacterSet::ISO8859_13},
    {16, CharacterSet::ISO8859_14},
    {17, CharacterSet::ISO8859_15},
    {18, CharacterSet::ISO8859_16},
    // no entry for 19
    {20, CharacterSet::Shift_JIS},
    {21, CharacterSet::Cp1250},
    {22, CharacterSet::Cp1251},
    {23, CharacterSet::Cp1252},
    {24, CharacterSet::Cp1256},
    {25, CharacterSet::UnicodeBig},
    {26, CharacterSet::UTF8},
    {27, CharacterSet::ASCII},
    {28, CharacterSet::Big5},
    {29, CharacterSet::GB18030},
    {30, CharacterSet::EUC_KR},
    {170, CharacterSet::ASCII},
    {899, CharacterSet::BINARY},
};
61
/**
 * Strict-weak "less than" ordering for NUL-terminated C strings that ignores letter case.
 *
 * The strings are compared byte by byte after folding each byte with std::tolower (which
 * follows the current C locale). When one string is a proper prefix of the other, the
 * shorter one orders first. Two strings that differ only in case compare as equivalent,
 * i.e. neither is less than the other.
 *
 * Both pointers must be non-null and point to NUL-terminated strings.
 */
struct CompareNoCase {
    bool operator ()(const char* a, const char* b) const {
        while (*a != '\0' && *b != '\0') {
            // std::tolower is only defined for EOF and values representable as unsigned char;
            // a plain (possibly signed) char >= 0x80 would be negative, so convert first.
            const int ca = std::tolower(static_cast<unsigned char>(*a++));
            const int cb = std::tolower(static_cast<unsigned char>(*b++));
            if (ca != cb) {
                return ca < cb;
            }
        }
        // Common prefix exhausted: 'a' is smaller only if it ended first.
        return *a == '\0' && *b != '\0';
    }
};
77
// Lookup table from character set names to CharacterSet.
// Most character sets are listed under more than one spelling (e.g. "ISO8859_1" and
// "ISO-8859-1", "Cp1252" and "windows-1252"). The keys are string literals ordered by
// CompareNoCase, so lookups are case-insensitive: "utf-8" and "UTF-8" find the same entry.
static const std::map<const char *, CharacterSet, CompareNoCase> ECI_NAME_TO_CHARSET = {
    {"Cp437", CharacterSet::Cp437},
    {"ISO8859_1", CharacterSet::ISO8859_1},
    {"ISO-8859-1", CharacterSet::ISO8859_1},
    {"ISO8859_2", CharacterSet::ISO8859_2},
    {"ISO-8859-2", CharacterSet::ISO8859_2},
    {"ISO8859_3", CharacterSet::ISO8859_3},
    {"ISO-8859-3", CharacterSet::ISO8859_3},
    {"ISO8859_4", CharacterSet::ISO8859_4},
    {"ISO-8859-4", CharacterSet::ISO8859_4},
    {"ISO8859_5", CharacterSet::ISO8859_5},
    {"ISO-8859-5", CharacterSet::ISO8859_5},
    {"ISO8859_6", CharacterSet::ISO8859_6},
    {"ISO-8859-6", CharacterSet::ISO8859_6},
    {"ISO8859_7", CharacterSet::ISO8859_7},
    {"ISO-8859-7", CharacterSet::ISO8859_7},
    {"ISO8859_8", CharacterSet::ISO8859_8},
    {"ISO-8859-8", CharacterSet::ISO8859_8},
    {"ISO8859_9", CharacterSet::ISO8859_9},
    {"ISO-8859-9", CharacterSet::ISO8859_9},
    {"ISO8859_10", CharacterSet::ISO8859_10},
    {"ISO-8859-10", CharacterSet::ISO8859_10},
    {"ISO8859_11", CharacterSet::ISO8859_11},
    {"ISO-8859-11", CharacterSet::ISO8859_11},
    {"ISO8859_13", CharacterSet::ISO8859_13},
    {"ISO-8859-13", CharacterSet::ISO8859_13},
    {"ISO8859_14", CharacterSet::ISO8859_14},
    {"ISO-8859-14", CharacterSet::ISO8859_14},
    {"ISO8859_15", CharacterSet::ISO8859_15},
    {"ISO-8859-15", CharacterSet::ISO8859_15},
    {"ISO8859_16", CharacterSet::ISO8859_16},
    {"ISO-8859-16", CharacterSet::ISO8859_16},
    {"SJIS", CharacterSet::Shift_JIS},
    {"Shift_JIS", CharacterSet::Shift_JIS},
    {"Cp1250", CharacterSet::Cp1250},
    {"windows-1250",CharacterSet::Cp1250},
    {"Cp1251", CharacterSet::Cp1251},
    {"windows-1251",CharacterSet::Cp1251},
    {"Cp1252", CharacterSet::Cp1252},
    {"windows-1252",CharacterSet::Cp1252},
    {"Cp1256", CharacterSet::Cp1256},
    {"windows-1256",CharacterSet::Cp1256},
    {"UnicodeBigUnmarked", CharacterSet::UnicodeBig},
    {"UTF-16BE", CharacterSet::UnicodeBig},
    {"UnicodeBig", CharacterSet::UnicodeBig},
    {"UTF8", CharacterSet::UTF8},
    {"UTF-8", CharacterSet::UTF8},
    {"ASCII", CharacterSet::ASCII},
    {"US-ASCII", CharacterSet::ASCII},
    {"Big5", CharacterSet::Big5},
    {"GB2312", CharacterSet::GB2312},
    {"GB18030", CharacterSet::GB18030},
    // The EUC-CN and GBK names are all resolved to GB18030 here.
    {"EUC_CN", CharacterSet::GB18030},
    {"EUC-CN", CharacterSet::GB18030},
    {"GBK", CharacterSet::GB18030},
    {"EUC_KR", CharacterSet::EUC_KR},
    {"EUC-KR", CharacterSet::EUC_KR},
    {"BINARY", CharacterSet::BINARY},
};
137
CharsetFromValue(int value)138 CharacterSet CharsetFromValue(int value)
139 {
140 auto it = ECI_VALUE_TO_CHARSET.find(value);
141 if (it != ECI_VALUE_TO_CHARSET.end()) {
142 return it->second;
143 }
144 return CharacterSet::Unknown;
145 }
146
ValueForCharset(CharacterSet charset)147 int ValueForCharset(CharacterSet charset)
148 {
149 // Special case ISO8859_1 to avoid obsolete ECI 1
150 if (charset == CharacterSet::ISO8859_1) {
151 return 3;
152 }
153 for (auto& [key, value] : ECI_VALUE_TO_CHARSET) {
154 if (value == charset) {
155 return key;
156 }
157 }
158 return -1;
159 }
160
CharsetFromName(const char * name)161 CharacterSet CharsetFromName(const char* name)
162 {
163 auto it = ECI_NAME_TO_CHARSET.find(name);
164 if (it != ECI_NAME_TO_CHARSET.end()) {
165 return it->second;
166 }
167 return CharacterSet::Unknown;
168 }
169
InitEncoding(const std::string & name,CharacterSet encodingDefault)170 CharacterSet InitEncoding(const std::string& name, CharacterSet encodingDefault)
171 {
172 if (!name.empty()) {
173 auto encodingInit = CharacterSetECI::CharsetFromName(name.c_str());
174 if (encodingInit != CharacterSet::Unknown) {
175 encodingDefault = encodingInit;
176 }
177 }
178
179 return encodingDefault;
180 }
181
OnChangeAppendReset(const int eci,std::wstring & encoded,std::string & data,CharacterSet encoding)182 CharacterSet OnChangeAppendReset(const int eci, std::wstring& encoded, std::string& data, CharacterSet encoding)
183 {
184 // Character set ECIs only
185 if (eci >= 0 && eci <= 899) {
186 auto encodingNew = CharacterSetECI::CharsetFromValue(eci);
187 if (encodingNew != CharacterSet::Unknown && encodingNew != encoding) {
188 // Encode data so far in current encoding and reset
189 TextDecoder::Append(encoded, reinterpret_cast<const uint8_t*>(data.data()), data.size(), encoding);
190 data.clear();
191 encoding = encodingNew;
192 }
193 }
194
195 return encoding;
196 }
197
198 } // namespace ZXing::CharacterSetECI
199