1 // Copyright 2017 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/font/cpdf_tounicodemap.h"
8
9 #include <utility>
10
11 #include "core/fpdfapi/font/cpdf_cid2unicodemap.h"
12 #include "core/fpdfapi/font/cpdf_fontglobals.h"
13 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
14 #include "core/fpdfapi/parser/cpdf_stream.h"
15 #include "core/fxcrt/fx_extension.h"
16 #include "core/fxcrt/fx_safe_types.h"
17 #include "third_party/base/numerics/safe_conversions.h"
18
19 namespace {
20
StringDataAdd(WideString str)21 WideString StringDataAdd(WideString str) {
22 WideString ret;
23 wchar_t value = 1;
24 for (size_t i = str.GetLength(); i > 0; --i) {
25 wchar_t ch = str[i - 1] + value;
26 if (ch < str[i - 1]) {
27 ret.InsertAtFront(0);
28 } else {
29 ret.InsertAtFront(ch);
30 value = 0;
31 }
32 }
33 if (value)
34 ret.InsertAtFront(value);
35 return ret;
36 }
37
38 } // namespace
39
CPDF_ToUnicodeMap(const CPDF_Stream * pStream)40 CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(const CPDF_Stream* pStream) {
41 Load(pStream);
42 }
43
44 CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default;
45
Lookup(uint32_t charcode) const46 WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const {
47 auto it = m_Multimap.find(charcode);
48 if (it == m_Multimap.end()) {
49 if (!m_pBaseMap)
50 return WideString();
51 return m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode));
52 }
53
54 uint32_t value = it->second;
55 wchar_t unicode = static_cast<wchar_t>(value & 0xffff);
56 if (unicode != 0xffff)
57 return unicode;
58
59 WideStringView buf = m_MultiCharBuf.AsStringView();
60 size_t index = value >> 16;
61 if (!buf.IsValidIndex(index))
62 return WideString();
63 return WideString(buf.Substr(index + 1, buf[index]));
64 }
65
ReverseLookup(wchar_t unicode) const66 uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const {
67 for (const auto& pair : m_Multimap) {
68 if (pair.second == static_cast<uint32_t>(unicode))
69 return pair.first;
70 }
71 return 0;
72 }
73
74 // static
StringToCode(ByteStringView str)75 pdfium::Optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView str) {
76 size_t len = str.GetLength();
77 if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
78 return pdfium::nullopt;
79
80 FX_SAFE_UINT32 code = 0;
81 for (char c : str.Substr(1, len - 2)) {
82 if (!FXSYS_IsHexDigit(c))
83 return pdfium::nullopt;
84
85 code = code * 16 + FXSYS_HexCharToInt(c);
86 if (!code.IsValid())
87 return pdfium::nullopt;
88 }
89 return pdfium::Optional<uint32_t>(code.ValueOrDie());
90 }
91
92 // static
StringToWideString(ByteStringView str)93 WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) {
94 size_t len = str.GetLength();
95 if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
96 return WideString();
97
98 WideString result;
99 int byte_pos = 0;
100 wchar_t ch = 0;
101 for (char c : str.Substr(1, len - 2)) {
102 if (!FXSYS_IsHexDigit(c))
103 break;
104
105 ch = ch * 16 + FXSYS_HexCharToInt(c);
106 byte_pos++;
107 if (byte_pos == 4) {
108 result += ch;
109 byte_pos = 0;
110 ch = 0;
111 }
112 }
113 return result;
114 }
115
Load(const CPDF_Stream * pStream)116 void CPDF_ToUnicodeMap::Load(const CPDF_Stream* pStream) {
117 CIDSet cid_set = CIDSET_UNKNOWN;
118 auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(pStream);
119 pAcc->LoadAllDataFiltered();
120 CPDF_SimpleParser parser(pAcc->GetSpan());
121 while (1) {
122 ByteStringView word = parser.GetWord();
123 if (word.IsEmpty())
124 break;
125
126 if (word == "beginbfchar")
127 HandleBeginBFChar(&parser);
128 else if (word == "beginbfrange")
129 HandleBeginBFRange(&parser);
130 else if (word == "/Adobe-Korea1-UCS2")
131 cid_set = CIDSET_KOREA1;
132 else if (word == "/Adobe-Japan1-UCS2")
133 cid_set = CIDSET_JAPAN1;
134 else if (word == "/Adobe-CNS1-UCS2")
135 cid_set = CIDSET_CNS1;
136 else if (word == "/Adobe-GB1-UCS2")
137 cid_set = CIDSET_GB1;
138 }
139 if (cid_set) {
140 auto* manager = CPDF_FontGlobals::GetInstance()->GetCMapManager();
141 m_pBaseMap = manager->GetCID2UnicodeMap(cid_set);
142 }
143 }
144
HandleBeginBFChar(CPDF_SimpleParser * pParser)145 void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) {
146 while (1) {
147 ByteStringView word = pParser->GetWord();
148 if (word.IsEmpty() || word == "endbfchar")
149 return;
150
151 pdfium::Optional<uint32_t> code = StringToCode(word);
152 if (!code.has_value())
153 return;
154
155 SetCode(code.value(), StringToWideString(pParser->GetWord()));
156 }
157 }
158
HandleBeginBFRange(CPDF_SimpleParser * pParser)159 void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) {
160 while (1) {
161 ByteStringView lowcode_str = pParser->GetWord();
162 if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange")
163 return;
164
165 pdfium::Optional<uint32_t> lowcode_opt = StringToCode(lowcode_str);
166 if (!lowcode_opt.has_value())
167 return;
168
169 ByteStringView highcode_str = pParser->GetWord();
170 pdfium::Optional<uint32_t> highcode_opt = StringToCode(highcode_str);
171 if (!highcode_opt.has_value())
172 return;
173
174 uint32_t lowcode = lowcode_opt.value();
175 uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff);
176
177 ByteStringView start = pParser->GetWord();
178 if (start == "[") {
179 for (uint32_t code = lowcode; code <= highcode; code++)
180 SetCode(code, StringToWideString(pParser->GetWord()));
181 pParser->GetWord();
182 continue;
183 }
184
185 WideString destcode = StringToWideString(start);
186 if (destcode.GetLength() == 1) {
187 pdfium::Optional<uint32_t> value_or_error = StringToCode(start);
188 if (!value_or_error.has_value())
189 return;
190
191 uint32_t value = value_or_error.value();
192 for (uint32_t code = lowcode; code <= highcode; code++)
193 m_Multimap.emplace(code, value++);
194 } else {
195 for (uint32_t code = lowcode; code <= highcode; code++) {
196 WideString retcode =
197 code == lowcode ? destcode : StringDataAdd(destcode);
198 m_Multimap.emplace(code, GetUnicode());
199 m_MultiCharBuf.AppendChar(retcode.GetLength());
200 m_MultiCharBuf << retcode;
201 destcode = std::move(retcode);
202 }
203 }
204 }
205 }
206
GetUnicode() const207 uint32_t CPDF_ToUnicodeMap::GetUnicode() const {
208 FX_SAFE_UINT32 uni = m_MultiCharBuf.GetLength();
209 uni = uni * 0x10000 + 0xffff;
210 return uni.ValueOrDefault(0);
211 }
212
SetCode(uint32_t srccode,WideString destcode)213 void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) {
214 size_t len = destcode.GetLength();
215 if (len == 0)
216 return;
217
218 if (len == 1) {
219 m_Multimap.emplace(srccode, destcode[0]);
220 } else {
221 m_Multimap.emplace(srccode, GetUnicode());
222 m_MultiCharBuf.AppendChar(len);
223 m_MultiCharBuf << destcode;
224 }
225 }
226