1 // Copyright 2017 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/font/cpdf_cmap.h"
8 
9 #include <memory>
10 #include <utility>
11 #include <vector>
12 
13 #include "core/fpdfapi/cmaps/fpdf_cmaps.h"
14 #include "core/fpdfapi/font/cpdf_cmapparser.h"
15 #include "core/fpdfapi/font/cpdf_fontglobals.h"
16 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
17 
18 namespace {
19 
// A closed interval of byte values: every value v with
// m_First <= v <= m_Last belongs to the range.
struct ByteRange {
  uint8_t m_First;  // Lowest byte value in the range.
  uint8_t m_Last;   // Highest byte value in the range; inclusive.
};
24 
25 struct PredefinedCMap {
26   const char* m_pName;  // Raw, POD struct.
27   CIDSet m_Charset;
28   CIDCoding m_Coding;
29   CPDF_CMap::CodingScheme m_CodingScheme;
30   uint8_t m_LeadingSegCount;
31   ByteRange m_LeadingSegs[2];
32 };
33 
34 constexpr PredefinedCMap kPredefinedCMaps[] = {
35     {"GB-EUC",
36      CIDSET_GB1,
37      CIDCODING_GB,
38      CPDF_CMap::MixedTwoBytes,
39      1,
40      {{0xa1, 0xfe}}},
41     {"GBpc-EUC",
42      CIDSET_GB1,
43      CIDCODING_GB,
44      CPDF_CMap::MixedTwoBytes,
45      1,
46      {{0xa1, 0xfc}}},
47     {"GBK-EUC",
48      CIDSET_GB1,
49      CIDCODING_GB,
50      CPDF_CMap::MixedTwoBytes,
51      1,
52      {{0x81, 0xfe}}},
53     {"GBKp-EUC",
54      CIDSET_GB1,
55      CIDCODING_GB,
56      CPDF_CMap::MixedTwoBytes,
57      1,
58      {{0x81, 0xfe}}},
59     {"GBK2K-EUC",
60      CIDSET_GB1,
61      CIDCODING_GB,
62      CPDF_CMap::MixedTwoBytes,
63      1,
64      {{0x81, 0xfe}}},
65     {"GBK2K",
66      CIDSET_GB1,
67      CIDCODING_GB,
68      CPDF_CMap::MixedTwoBytes,
69      1,
70      {{0x81, 0xfe}}},
71     {"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
72     {"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
73     {"B5pc",
74      CIDSET_CNS1,
75      CIDCODING_BIG5,
76      CPDF_CMap::MixedTwoBytes,
77      1,
78      {{0xa1, 0xfc}}},
79     {"HKscs-B5",
80      CIDSET_CNS1,
81      CIDCODING_BIG5,
82      CPDF_CMap::MixedTwoBytes,
83      1,
84      {{0x88, 0xfe}}},
85     {"ETen-B5",
86      CIDSET_CNS1,
87      CIDCODING_BIG5,
88      CPDF_CMap::MixedTwoBytes,
89      1,
90      {{0xa1, 0xfe}}},
91     {"ETenms-B5",
92      CIDSET_CNS1,
93      CIDCODING_BIG5,
94      CPDF_CMap::MixedTwoBytes,
95      1,
96      {{0xa1, 0xfe}}},
97     {"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
98     {"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
99     {"83pv-RKSJ",
100      CIDSET_JAPAN1,
101      CIDCODING_JIS,
102      CPDF_CMap::MixedTwoBytes,
103      2,
104      {{0x81, 0x9f}, {0xe0, 0xfc}}},
105     {"90ms-RKSJ",
106      CIDSET_JAPAN1,
107      CIDCODING_JIS,
108      CPDF_CMap::MixedTwoBytes,
109      2,
110      {{0x81, 0x9f}, {0xe0, 0xfc}}},
111     {"90msp-RKSJ",
112      CIDSET_JAPAN1,
113      CIDCODING_JIS,
114      CPDF_CMap::MixedTwoBytes,
115      2,
116      {{0x81, 0x9f}, {0xe0, 0xfc}}},
117     {"90pv-RKSJ",
118      CIDSET_JAPAN1,
119      CIDCODING_JIS,
120      CPDF_CMap::MixedTwoBytes,
121      2,
122      {{0x81, 0x9f}, {0xe0, 0xfc}}},
123     {"Add-RKSJ",
124      CIDSET_JAPAN1,
125      CIDCODING_JIS,
126      CPDF_CMap::MixedTwoBytes,
127      2,
128      {{0x81, 0x9f}, {0xe0, 0xfc}}},
129     {"EUC",
130      CIDSET_JAPAN1,
131      CIDCODING_JIS,
132      CPDF_CMap::MixedTwoBytes,
133      2,
134      {{0x8e, 0x8e}, {0xa1, 0xfe}}},
135     {"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
136     {"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
137     {"Ext-RKSJ",
138      CIDSET_JAPAN1,
139      CIDCODING_JIS,
140      CPDF_CMap::MixedTwoBytes,
141      2,
142      {{0x81, 0x9f}, {0xe0, 0xfc}}},
143     {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
144     {"UniJIS-UCS2-HW",
145      CIDSET_JAPAN1,
146      CIDCODING_UCS2,
147      CPDF_CMap::TwoBytes,
148      0,
149      {}},
150     {"UniJIS-UTF16",
151      CIDSET_JAPAN1,
152      CIDCODING_UTF16,
153      CPDF_CMap::TwoBytes,
154      0,
155      {}},
156     {"KSC-EUC",
157      CIDSET_KOREA1,
158      CIDCODING_KOREA,
159      CPDF_CMap::MixedTwoBytes,
160      1,
161      {{0xa1, 0xfe}}},
162     {"KSCms-UHC",
163      CIDSET_KOREA1,
164      CIDCODING_KOREA,
165      CPDF_CMap::MixedTwoBytes,
166      1,
167      {{0x81, 0xfe}}},
168     {"KSCms-UHC-HW",
169      CIDSET_KOREA1,
170      CIDCODING_KOREA,
171      CPDF_CMap::MixedTwoBytes,
172      1,
173      {{0x81, 0xfe}}},
174     {"KSCpc-EUC",
175      CIDSET_KOREA1,
176      CIDCODING_KOREA,
177      CPDF_CMap::MixedTwoBytes,
178      1,
179      {{0xa1, 0xfd}}},
180     {"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
181     {"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
182 };
183 
GetPredefinedCMap(ByteStringView cmapid)184 const PredefinedCMap* GetPredefinedCMap(ByteStringView cmapid) {
185   if (cmapid.GetLength() > 2)
186     cmapid = cmapid.First(cmapid.GetLength() - 2);
187   for (const auto& map : kPredefinedCMaps) {
188     if (cmapid == map.m_pName)
189       return &map;
190   }
191   return nullptr;
192 }
193 
LoadLeadingSegments(const PredefinedCMap & map)194 std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
195   std::vector<bool> segments(256);
196   for (uint32_t i = 0; i < map.m_LeadingSegCount; ++i) {
197     const ByteRange& seg = map.m_LeadingSegs[i];
198     for (int b = seg.m_First; b <= seg.m_Last; ++b)
199       segments[b] = true;
200   }
201   return segments;
202 }
203 
CheckFourByteCodeRange(uint8_t * codes,size_t size,const std::vector<CPDF_CMap::CodeRange> & ranges)204 int CheckFourByteCodeRange(uint8_t* codes,
205                            size_t size,
206                            const std::vector<CPDF_CMap::CodeRange>& ranges) {
207   for (size_t i = ranges.size(); i > 0; i--) {
208     size_t seg = i - 1;
209     if (ranges[seg].m_CharSize < size)
210       continue;
211     size_t iChar = 0;
212     while (iChar < size) {
213       if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
214           codes[iChar] > ranges[seg].m_Upper[iChar]) {
215         break;
216       }
217       ++iChar;
218     }
219     if (iChar == ranges[seg].m_CharSize)
220       return 2;
221     if (iChar)
222       return (size == ranges[seg].m_CharSize) ? 2 : 1;
223   }
224   return 0;
225 }
226 
GetFourByteCharSizeImpl(uint32_t charcode,const std::vector<CPDF_CMap::CodeRange> & ranges)227 size_t GetFourByteCharSizeImpl(
228     uint32_t charcode,
229     const std::vector<CPDF_CMap::CodeRange>& ranges) {
230   if (ranges.empty())
231     return 1;
232 
233   uint8_t codes[4];
234   codes[0] = codes[1] = 0x00;
235   codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
236   codes[3] = static_cast<uint8_t>(charcode);
237   for (size_t offset = 0; offset < 4; offset++) {
238     size_t size = 4 - offset;
239     for (size_t j = 0; j < ranges.size(); j++) {
240       size_t iSeg = (ranges.size() - 1) - j;
241       if (ranges[iSeg].m_CharSize < size)
242         continue;
243       size_t iChar = 0;
244       while (iChar < size) {
245         if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
246             codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
247           break;
248         }
249         ++iChar;
250       }
251       if (iChar == ranges[iSeg].m_CharSize)
252         return size;
253     }
254   }
255   return 1;
256 }
257 
258 }  // namespace
259 
CPDF_CMap(ByteStringView bsPredefinedName)260 CPDF_CMap::CPDF_CMap(ByteStringView bsPredefinedName)
261     : m_bVertical(bsPredefinedName.Back() == 'V') {
262   if (bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V") {
263     m_Coding = CIDCODING_CID;
264     m_bLoaded = true;
265     return;
266   }
267 
268   const PredefinedCMap* map = GetPredefinedCMap(bsPredefinedName);
269   if (!map)
270     return;
271 
272   m_Charset = map->m_Charset;
273   m_Coding = map->m_Coding;
274   m_CodingScheme = map->m_CodingScheme;
275   if (m_CodingScheme == MixedTwoBytes)
276     m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*map);
277   m_pEmbedMap = FindEmbeddedCMap(
278       CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
279       bsPredefinedName);
280   if (!m_pEmbedMap)
281     return;
282 
283   m_bLoaded = true;
284 }
285 
CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)286 CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
287     : m_DirectCharcodeToCIDTable(65536) {
288   CPDF_CMapParser parser(this);
289   CPDF_SimpleParser syntax(spEmbeddedData);
290   while (1) {
291     ByteStringView word = syntax.GetWord();
292     if (word.IsEmpty())
293       break;
294 
295     parser.ParseWord(word);
296   }
297 }
298 
299 CPDF_CMap::~CPDF_CMap() = default;
300 
CIDFromCharCode(uint32_t charcode) const301 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
302   if (m_Coding == CIDCODING_CID)
303     return static_cast<uint16_t>(charcode);
304 
305   if (m_pEmbedMap)
306     return ::CIDFromCharCode(m_pEmbedMap.Get(), charcode);
307 
308   if (m_DirectCharcodeToCIDTable.empty())
309     return static_cast<uint16_t>(charcode);
310 
311   if (charcode < 0x10000)
312     return m_DirectCharcodeToCIDTable[charcode];
313 
314   auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
315                              m_AdditionalCharcodeToCIDMappings.end(), charcode,
316                              [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
317                                return arg.m_EndCode < val;
318                              });
319   if (it == m_AdditionalCharcodeToCIDMappings.end() ||
320       it->m_StartCode > charcode) {
321     return 0;
322   }
323   return it->m_StartCID + charcode - it->m_StartCode;
324 }
325 
GetNextChar(ByteStringView pString,size_t * pOffset) const326 uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
327   size_t& offset = *pOffset;
328   auto pBytes = pString.raw_span();
329   switch (m_CodingScheme) {
330     case OneByte: {
331       return offset < pBytes.size() ? pBytes[offset++] : 0;
332     }
333     case TwoBytes: {
334       uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
335       uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
336       return 256 * byte1 + byte2;
337     }
338     case MixedTwoBytes: {
339       uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
340       if (!m_MixedTwoByteLeadingBytes[byte1])
341         return byte1;
342       uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
343       return 256 * byte1 + byte2;
344     }
345     case MixedFourBytes: {
346       uint8_t codes[4];
347       int char_size = 1;
348       codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
349       while (1) {
350         int ret = CheckFourByteCodeRange(codes, char_size,
351                                          m_MixedFourByteLeadingRanges);
352         if (ret == 0)
353           return 0;
354         if (ret == 2) {
355           uint32_t charcode = 0;
356           for (int i = 0; i < char_size; i++)
357             charcode = (charcode << 8) + codes[i];
358           return charcode;
359         }
360         if (char_size == 4 || offset == pBytes.size())
361           return 0;
362         codes[char_size++] = pBytes[offset++];
363       }
364       break;
365     }
366   }
367   return 0;
368 }
369 
GetCharSize(uint32_t charcode) const370 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
371   switch (m_CodingScheme) {
372     case OneByte:
373       return 1;
374     case TwoBytes:
375       return 2;
376     case MixedTwoBytes:
377       if (charcode < 0x100)
378         return 1;
379       return 2;
380     case MixedFourBytes:
381       if (charcode < 0x100)
382         return 1;
383       if (charcode < 0x10000)
384         return 2;
385       if (charcode < 0x1000000)
386         return 3;
387       return 4;
388   }
389   return 1;
390 }
391 
CountChar(ByteStringView pString) const392 size_t CPDF_CMap::CountChar(ByteStringView pString) const {
393   switch (m_CodingScheme) {
394     case OneByte:
395       return pString.GetLength();
396     case TwoBytes:
397       return (pString.GetLength() + 1) / 2;
398     case MixedTwoBytes: {
399       size_t count = 0;
400       for (size_t i = 0; i < pString.GetLength(); i++) {
401         count++;
402         if (m_MixedTwoByteLeadingBytes[pString[i]])
403           i++;
404       }
405       return count;
406     }
407     case MixedFourBytes: {
408       size_t count = 0;
409       size_t offset = 0;
410       while (offset < pString.GetLength()) {
411         GetNextChar(pString, &offset);
412         count++;
413       }
414       return count;
415     }
416   }
417   return pString.GetLength();
418 }
419 
AppendChar(char * str,uint32_t charcode) const420 int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
421   switch (m_CodingScheme) {
422     case OneByte:
423       str[0] = static_cast<char>(charcode);
424       return 1;
425     case TwoBytes:
426       str[0] = static_cast<char>(charcode / 256);
427       str[1] = static_cast<char>(charcode % 256);
428       return 2;
429     case MixedTwoBytes:
430       if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
431         str[0] = static_cast<char>(charcode);
432         return 1;
433       }
434       str[0] = static_cast<char>(charcode >> 8);
435       str[1] = static_cast<char>(charcode);
436       return 2;
437     case MixedFourBytes:
438       if (charcode < 0x100) {
439         int iSize = static_cast<int>(
440             GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
441         if (iSize == 0)
442           iSize = 1;
443         str[iSize - 1] = static_cast<char>(charcode);
444         if (iSize > 1)
445           memset(str, 0, iSize - 1);
446         return iSize;
447       }
448       if (charcode < 0x10000) {
449         str[0] = static_cast<char>(charcode >> 8);
450         str[1] = static_cast<char>(charcode);
451         return 2;
452       }
453       if (charcode < 0x1000000) {
454         str[0] = static_cast<char>(charcode >> 16);
455         str[1] = static_cast<char>(charcode >> 8);
456         str[2] = static_cast<char>(charcode);
457         return 3;
458       }
459       str[0] = static_cast<char>(charcode >> 24);
460       str[1] = static_cast<char>(charcode >> 16);
461       str[2] = static_cast<char>(charcode >> 8);
462       str[3] = static_cast<char>(charcode);
463       return 4;
464   }
465   return 0;
466 }
467 
SetAdditionalMappings(std::vector<CIDRange> mappings)468 void CPDF_CMap::SetAdditionalMappings(std::vector<CIDRange> mappings) {
469   ASSERT(m_AdditionalCharcodeToCIDMappings.empty());
470   if (m_CodingScheme != MixedFourBytes || mappings.empty())
471     return;
472 
473   std::sort(
474       mappings.begin(), mappings.end(),
475       [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
476         return arg1.m_EndCode < arg2.m_EndCode;
477       });
478   m_AdditionalCharcodeToCIDMappings = std::move(mappings);
479 }
480 
SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges)481 void CPDF_CMap::SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges) {
482   m_MixedFourByteLeadingRanges = std::move(ranges);
483 }
484