1 /*
2 * Copyright 2016 Nu-book Inc.
3 * Copyright 2016 ZXing authors
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *      http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 
18 #include "CharacterSetECI.h"
19 #include "TextDecoder.h"
20 
21 #include <cctype>
22 #include <map>
23 #include <utility>
24 #include <algorithm>
25 
26 namespace ZXing::CharacterSetECI {
27 
// Lookup table from numeric ECI (Extended Channel Interpretation) value to the
// CharacterSet it designates. Several ECI values may map to the same
// CharacterSet (e.g. 0 and 2 -> Cp437, 1 and 3 -> ISO8859_1, 27 and 170 -> ASCII).
// Because std::map iterates in ascending key order, a reverse lookup that walks
// this table finds the smallest ECI value for a given CharacterSet first.
// Values not listed here (e.g. 14 and 19) have no entry and resolve to
// CharacterSet::Unknown in CharsetFromValue().
static const std::map<int, CharacterSet> ECI_VALUE_TO_CHARSET = {
	{0,  CharacterSet::Cp437}, // Obsolete
	{1,  CharacterSet::ISO8859_1}, // Obsolete
	{2,  CharacterSet::Cp437}, // Obsolete but still used by PDF417 Macro fields (ISO/IEC 15438:2015 Annex H.2.3)
	{3,  CharacterSet::ISO8859_1},
	{4,  CharacterSet::ISO8859_2},
	{5,  CharacterSet::ISO8859_3},
	{6,  CharacterSet::ISO8859_4},
	{7,  CharacterSet::ISO8859_5},
	{8,  CharacterSet::ISO8859_6},
	{9,  CharacterSet::ISO8859_7},
	{10, CharacterSet::ISO8859_8},
	{11, CharacterSet::ISO8859_9},
	{12, CharacterSet::ISO8859_10},
	{13, CharacterSet::ISO8859_11},
	{15, CharacterSet::ISO8859_13},
	{16, CharacterSet::ISO8859_14},
	{17, CharacterSet::ISO8859_15},
	{18, CharacterSet::ISO8859_16},
	{20, CharacterSet::Shift_JIS},
	{21, CharacterSet::Cp1250},
	{22, CharacterSet::Cp1251},
	{23, CharacterSet::Cp1252},
	{24, CharacterSet::Cp1256},
	{25, CharacterSet::UnicodeBig},
	{26, CharacterSet::UTF8},
	{27, CharacterSet::ASCII},
	{28, CharacterSet::Big5},
	{29, CharacterSet::GB18030},
	{30, CharacterSet::EUC_KR},
	{170, CharacterSet::ASCII},
	{899, CharacterSet::BINARY},
};
61 
// Strict-weak "less than" ordering for NUL-terminated C strings that ignores
// character case, suitable as the comparator of an ordered container keyed by
// `const char*`. A string that is a proper prefix of another sorts before it.
// Both arguments must be non-null pointers to NUL-terminated strings.
struct CompareNoCase {
	bool operator ()(const char* a, const char* b) const {
		for (; *a != '\0' && *b != '\0'; ++a, ++b) {
			// std::tolower has undefined behaviour for negative arguments other than EOF,
			// so bytes >= 0x80 must be widened through unsigned char, not plain char.
			const int ca = std::tolower(static_cast<unsigned char>(*a));
			const int cb = std::tolower(static_cast<unsigned char>(*b));
			if (ca != cb) {
				return ca < cb;
			}
		}
		// Common prefix is equal: `a` is smaller only if it ended first.
		return *a == '\0' && *b != '\0';
	}
};
77 
// Lookup table from character set name to CharacterSet. The map is ordered with
// CompareNoCase, so lookups match names regardless of letter case. Several
// spellings/aliases of the same character set are listed and map to the same
// CharacterSet value. Keys are string literals, so the stored pointers stay
// valid for the lifetime of the program.
static const std::map<const char *, CharacterSet, CompareNoCase> ECI_NAME_TO_CHARSET = {
	{"Cp437",		CharacterSet::Cp437},
	{"ISO8859_1",	CharacterSet::ISO8859_1},
	{"ISO-8859-1",	CharacterSet::ISO8859_1},
	{"ISO8859_2",	CharacterSet::ISO8859_2},
	{"ISO-8859-2",	CharacterSet::ISO8859_2},
	{"ISO8859_3",	CharacterSet::ISO8859_3},
	{"ISO-8859-3",	CharacterSet::ISO8859_3},
	{"ISO8859_4",	CharacterSet::ISO8859_4},
	{"ISO-8859-4",	CharacterSet::ISO8859_4},
	{"ISO8859_5",	CharacterSet::ISO8859_5},
	{"ISO-8859-5",	CharacterSet::ISO8859_5},
	{"ISO8859_6",	CharacterSet::ISO8859_6},
	{"ISO-8859-6",	CharacterSet::ISO8859_6},
	{"ISO8859_7",	CharacterSet::ISO8859_7},
	{"ISO-8859-7",	CharacterSet::ISO8859_7},
	{"ISO8859_8",	CharacterSet::ISO8859_8},
	{"ISO-8859-8",	CharacterSet::ISO8859_8},
	{"ISO8859_9",	CharacterSet::ISO8859_9},
	{"ISO-8859-9",	CharacterSet::ISO8859_9},
	{"ISO8859_10",	CharacterSet::ISO8859_10},
	{"ISO-8859-10",	CharacterSet::ISO8859_10},
	{"ISO8859_11",	CharacterSet::ISO8859_11},
	{"ISO-8859-11",	CharacterSet::ISO8859_11},
	{"ISO8859_13",	CharacterSet::ISO8859_13},
	{"ISO-8859-13",	CharacterSet::ISO8859_13},
	{"ISO8859_14",	CharacterSet::ISO8859_14},
	{"ISO-8859-14",	CharacterSet::ISO8859_14},
	{"ISO8859_15",	CharacterSet::ISO8859_15},
	{"ISO-8859-15",	CharacterSet::ISO8859_15},
	{"ISO8859_16",	CharacterSet::ISO8859_16},
	{"ISO-8859-16",	CharacterSet::ISO8859_16},
	{"SJIS",		CharacterSet::Shift_JIS},
	{"Shift_JIS",	CharacterSet::Shift_JIS},
	{"Cp1250",		CharacterSet::Cp1250},
	{"windows-1250",CharacterSet::Cp1250},
	{"Cp1251",		CharacterSet::Cp1251},
	{"windows-1251",CharacterSet::Cp1251},
	{"Cp1252",		CharacterSet::Cp1252},
	{"windows-1252",CharacterSet::Cp1252},
	{"Cp1256",		CharacterSet::Cp1256},
	{"windows-1256",CharacterSet::Cp1256},
	{"UnicodeBigUnmarked", CharacterSet::UnicodeBig},
	{"UTF-16BE",	CharacterSet::UnicodeBig},
	{"UnicodeBig",	CharacterSet::UnicodeBig},
	{"UTF8",		CharacterSet::UTF8},
	{"UTF-8",		CharacterSet::UTF8},
	{"ASCII",		CharacterSet::ASCII},
	{"US-ASCII",	CharacterSet::ASCII},
	{"Big5",		CharacterSet::Big5},
	{"GB2312",		CharacterSet::GB2312},
	{"GB18030",		CharacterSet::GB18030},
	{"EUC_CN",		CharacterSet::GB18030},
	{"EUC-CN",		CharacterSet::GB18030},
	{"GBK",			CharacterSet::GB18030},
	{"EUC_KR",		CharacterSet::EUC_KR},
	{"EUC-KR",		CharacterSet::EUC_KR},
	{"BINARY",		CharacterSet::BINARY},
};
137 
CharsetFromValue(int value)138 CharacterSet CharsetFromValue(int value)
139 {
140 	auto it = ECI_VALUE_TO_CHARSET.find(value);
141 	if (it != ECI_VALUE_TO_CHARSET.end()) {
142 		return it->second;
143 	}
144 	return CharacterSet::Unknown;
145 }
146 
ValueForCharset(CharacterSet charset)147 int ValueForCharset(CharacterSet charset)
148 {
149 	// Special case ISO8859_1 to avoid obsolete ECI 1
150 	if (charset == CharacterSet::ISO8859_1) {
151 		return 3;
152 	}
153 	for (auto& [key, value] : ECI_VALUE_TO_CHARSET) {
154 		if (value == charset) {
155 			return key;
156 		}
157 	}
158 	return -1;
159 }
160 
CharsetFromName(const char * name)161 CharacterSet CharsetFromName(const char* name)
162 {
163 	auto it = ECI_NAME_TO_CHARSET.find(name);
164 	if (it != ECI_NAME_TO_CHARSET.end()) {
165 		return it->second;
166 	}
167 	return CharacterSet::Unknown;
168 }
169 
InitEncoding(const std::string & name,CharacterSet encodingDefault)170 CharacterSet InitEncoding(const std::string& name, CharacterSet encodingDefault)
171 {
172 	if (!name.empty()) {
173 		auto encodingInit = CharacterSetECI::CharsetFromName(name.c_str());
174 		if (encodingInit != CharacterSet::Unknown) {
175 			encodingDefault = encodingInit;
176 		}
177 	}
178 
179 	return encodingDefault;
180 }
181 
OnChangeAppendReset(const int eci,std::wstring & encoded,std::string & data,CharacterSet encoding)182 CharacterSet OnChangeAppendReset(const int eci, std::wstring& encoded, std::string& data, CharacterSet encoding)
183 {
184 	// Character set ECIs only
185 	if (eci >= 0 && eci <= 899) {
186 		auto encodingNew = CharacterSetECI::CharsetFromValue(eci);
187 		if (encodingNew != CharacterSet::Unknown && encodingNew != encoding) {
188 			// Encode data so far in current encoding and reset
189 			TextDecoder::Append(encoded, reinterpret_cast<const uint8_t*>(data.data()), data.size(), encoding);
190 			data.clear();
191 			encoding = encodingNew;
192 		}
193 	}
194 
195 	return encoding;
196 }
197 
198 } // namespace ZXing::CharacterSetECI
199