1 // Copyright 2017 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/font/cpdf_tounicodemap.h"
8 
9 #include <utility>
10 
11 #include "core/fpdfapi/font/cpdf_cid2unicodemap.h"
12 #include "core/fpdfapi/font/cpdf_fontglobals.h"
13 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
14 #include "core/fpdfapi/parser/cpdf_stream.h"
15 #include "core/fxcrt/fx_extension.h"
16 #include "core/fxcrt/fx_safe_types.h"
17 #include "third_party/base/numerics/safe_conversions.h"
18 
19 namespace {
20 
StringDataAdd(WideString str)21 WideString StringDataAdd(WideString str) {
22   WideString ret;
23   wchar_t value = 1;
24   for (size_t i = str.GetLength(); i > 0; --i) {
25     wchar_t ch = str[i - 1] + value;
26     if (ch < str[i - 1]) {
27       ret.InsertAtFront(0);
28     } else {
29       ret.InsertAtFront(ch);
30       value = 0;
31     }
32   }
33   if (value)
34     ret.InsertAtFront(value);
35   return ret;
36 }
37 
38 }  // namespace
39 
CPDF_ToUnicodeMap(const CPDF_Stream * pStream)40 CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(const CPDF_Stream* pStream) {
41   Load(pStream);
42 }
43 
44 CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default;
45 
Lookup(uint32_t charcode) const46 WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const {
47   auto it = m_Multimap.find(charcode);
48   if (it == m_Multimap.end()) {
49     if (!m_pBaseMap)
50       return WideString();
51     return m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode));
52   }
53 
54   uint32_t value = it->second;
55   wchar_t unicode = static_cast<wchar_t>(value & 0xffff);
56   if (unicode != 0xffff)
57     return unicode;
58 
59   WideStringView buf = m_MultiCharBuf.AsStringView();
60   size_t index = value >> 16;
61   if (!buf.IsValidIndex(index))
62     return WideString();
63   return WideString(buf.Substr(index + 1, buf[index]));
64 }
65 
ReverseLookup(wchar_t unicode) const66 uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const {
67   for (const auto& pair : m_Multimap) {
68     if (pair.second == static_cast<uint32_t>(unicode))
69       return pair.first;
70   }
71   return 0;
72 }
73 
74 // static
StringToCode(ByteStringView str)75 pdfium::Optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView str) {
76   size_t len = str.GetLength();
77   if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
78     return pdfium::nullopt;
79 
80   FX_SAFE_UINT32 code = 0;
81   for (char c : str.Substr(1, len - 2)) {
82     if (!FXSYS_IsHexDigit(c))
83       return pdfium::nullopt;
84 
85     code = code * 16 + FXSYS_HexCharToInt(c);
86     if (!code.IsValid())
87       return pdfium::nullopt;
88   }
89   return pdfium::Optional<uint32_t>(code.ValueOrDie());
90 }
91 
92 // static
StringToWideString(ByteStringView str)93 WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) {
94   size_t len = str.GetLength();
95   if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
96     return WideString();
97 
98   WideString result;
99   int byte_pos = 0;
100   wchar_t ch = 0;
101   for (char c : str.Substr(1, len - 2)) {
102     if (!FXSYS_IsHexDigit(c))
103       break;
104 
105     ch = ch * 16 + FXSYS_HexCharToInt(c);
106     byte_pos++;
107     if (byte_pos == 4) {
108       result += ch;
109       byte_pos = 0;
110       ch = 0;
111     }
112   }
113   return result;
114 }
115 
Load(const CPDF_Stream * pStream)116 void CPDF_ToUnicodeMap::Load(const CPDF_Stream* pStream) {
117   CIDSet cid_set = CIDSET_UNKNOWN;
118   auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(pStream);
119   pAcc->LoadAllDataFiltered();
120   CPDF_SimpleParser parser(pAcc->GetSpan());
121   while (1) {
122     ByteStringView word = parser.GetWord();
123     if (word.IsEmpty())
124       break;
125 
126     if (word == "beginbfchar")
127       HandleBeginBFChar(&parser);
128     else if (word == "beginbfrange")
129       HandleBeginBFRange(&parser);
130     else if (word == "/Adobe-Korea1-UCS2")
131       cid_set = CIDSET_KOREA1;
132     else if (word == "/Adobe-Japan1-UCS2")
133       cid_set = CIDSET_JAPAN1;
134     else if (word == "/Adobe-CNS1-UCS2")
135       cid_set = CIDSET_CNS1;
136     else if (word == "/Adobe-GB1-UCS2")
137       cid_set = CIDSET_GB1;
138   }
139   if (cid_set) {
140     auto* manager = CPDF_FontGlobals::GetInstance()->GetCMapManager();
141     m_pBaseMap = manager->GetCID2UnicodeMap(cid_set);
142   }
143 }
144 
HandleBeginBFChar(CPDF_SimpleParser * pParser)145 void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) {
146   while (1) {
147     ByteStringView word = pParser->GetWord();
148     if (word.IsEmpty() || word == "endbfchar")
149       return;
150 
151     pdfium::Optional<uint32_t> code = StringToCode(word);
152     if (!code.has_value())
153       return;
154 
155     SetCode(code.value(), StringToWideString(pParser->GetWord()));
156   }
157 }
158 
HandleBeginBFRange(CPDF_SimpleParser * pParser)159 void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) {
160   while (1) {
161     ByteStringView lowcode_str = pParser->GetWord();
162     if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange")
163       return;
164 
165     pdfium::Optional<uint32_t> lowcode_opt = StringToCode(lowcode_str);
166     if (!lowcode_opt.has_value())
167       return;
168 
169     ByteStringView highcode_str = pParser->GetWord();
170     pdfium::Optional<uint32_t> highcode_opt = StringToCode(highcode_str);
171     if (!highcode_opt.has_value())
172       return;
173 
174     uint32_t lowcode = lowcode_opt.value();
175     uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff);
176 
177     ByteStringView start = pParser->GetWord();
178     if (start == "[") {
179       for (uint32_t code = lowcode; code <= highcode; code++)
180         SetCode(code, StringToWideString(pParser->GetWord()));
181       pParser->GetWord();
182       continue;
183     }
184 
185     WideString destcode = StringToWideString(start);
186     if (destcode.GetLength() == 1) {
187       pdfium::Optional<uint32_t> value_or_error = StringToCode(start);
188       if (!value_or_error.has_value())
189         return;
190 
191       uint32_t value = value_or_error.value();
192       for (uint32_t code = lowcode; code <= highcode; code++)
193         m_Multimap.emplace(code, value++);
194     } else {
195       for (uint32_t code = lowcode; code <= highcode; code++) {
196         WideString retcode =
197             code == lowcode ? destcode : StringDataAdd(destcode);
198         m_Multimap.emplace(code, GetUnicode());
199         m_MultiCharBuf.AppendChar(retcode.GetLength());
200         m_MultiCharBuf << retcode;
201         destcode = std::move(retcode);
202       }
203     }
204   }
205 }
206 
GetUnicode() const207 uint32_t CPDF_ToUnicodeMap::GetUnicode() const {
208   FX_SAFE_UINT32 uni = m_MultiCharBuf.GetLength();
209   uni = uni * 0x10000 + 0xffff;
210   return uni.ValueOrDefault(0);
211 }
212 
SetCode(uint32_t srccode,WideString destcode)213 void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) {
214   size_t len = destcode.GetLength();
215   if (len == 0)
216     return;
217 
218   if (len == 1) {
219     m_Multimap.emplace(srccode, destcode[0]);
220   } else {
221     m_Multimap.emplace(srccode, GetUnicode());
222     m_MultiCharBuf.AppendChar(len);
223     m_MultiCharBuf << destcode;
224   }
225 }
226