1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_linkextract.h"
8 
9 #include <vector>
10 
11 #include "core/fpdftext/cpdf_textpage.h"
12 #include "core/fxcrt/fx_extension.h"
13 #include "core/fxcrt/fx_string.h"
14 #include "core/fxcrt/fx_system.h"
15 
16 namespace {
17 
18 // Find the end of a web link starting from offset |start| and ending at offset
19 // |end|. The purpose of this function is to separate url from the surrounding
20 // context characters, we do not intend to fully validate the url. |str|
21 // contains lower case characters only.
FindWebLinkEnding(const WideString & str,size_t start,size_t end)22 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
23   if (str.Contains(L'/', start)) {
24     // When there is a path and query after '/', most ASCII chars are allowed.
25     // We don't sanitize in this case.
26     return end;
27   }
28 
29   // When there is no path, it only has IP address or host name.
30   // Port is optional at the end.
31   if (str[start] == L'[') {
32     // IPv6 reference.
33     // Find the end of the reference.
34     auto result = str.Find(L']', start + 1);
35     if (result.has_value()) {
36       end = result.value();
37       if (end > start + 1) {  // Has content inside brackets.
38         size_t len = str.GetLength();
39         size_t off = end + 1;
40         if (off < len && str[off] == L':') {
41           off++;
42           while (off < len && FXSYS_IsDecimalDigit(str[off]))
43             off++;
44           if (off > end + 2 &&
45               off <= len)   // At least one digit in port number.
46             end = off - 1;  // |off| is offset of the first invalid char.
47         }
48       }
49     }
50     return end;
51   }
52 
53   // According to RFC1123, host name only has alphanumeric chars, hyphens,
54   // and periods. Hyphen should not at the end though.
55   // Non-ASCII chars are ignored during checking.
56   while (end > start && str[end] < 0x80) {
57     if (FXSYS_IsDecimalDigit(str[end]) ||
58         (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') {
59       break;
60     }
61     end--;
62   }
63   return end;
64 }
65 
66 // Remove characters from the end of |str|, delimited by |start| and |end|, up
67 // to and including |charToFind|. No-op if |charToFind| is not present. Updates
68 // |end| if characters were removed.
TrimBackwardsToChar(const WideString & str,wchar_t charToFind,size_t start,size_t * end)69 void TrimBackwardsToChar(const WideString& str,
70                          wchar_t charToFind,
71                          size_t start,
72                          size_t* end) {
73   for (size_t pos = *end; pos >= start; pos--) {
74     if (str[pos] == charToFind) {
75       *end = pos - 1;
76       break;
77     }
78   }
79 }
80 
81 // Finds opening brackets ()[]{}<> and quotes "'  before the URL delimited by
82 // |start| and |end| in |str|. Matches a closing bracket or quote for each
83 // opening character and, if present, removes everything afterwards. Returns the
84 // new end position for the string.
TrimExternalBracketsFromWebLink(const WideString & str,size_t start,size_t end)85 size_t TrimExternalBracketsFromWebLink(const WideString& str,
86                                        size_t start,
87                                        size_t end) {
88   for (size_t pos = 0; pos < start; pos++) {
89     if (str[pos] == '(') {
90       TrimBackwardsToChar(str, ')', start, &end);
91     } else if (str[pos] == '[') {
92       TrimBackwardsToChar(str, ']', start, &end);
93     } else if (str[pos] == '{') {
94       TrimBackwardsToChar(str, '}', start, &end);
95     } else if (str[pos] == '<') {
96       TrimBackwardsToChar(str, '>', start, &end);
97     } else if (str[pos] == '"') {
98       TrimBackwardsToChar(str, '"', start, &end);
99     } else if (str[pos] == '\'') {
100       TrimBackwardsToChar(str, '\'', start, &end);
101     }
102   }
103   return end;
104 }
105 
106 }  // namespace
107 
CPDF_LinkExtract(const CPDF_TextPage * pTextPage)108 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
109     : m_pTextPage(pTextPage) {}
110 
111 CPDF_LinkExtract::~CPDF_LinkExtract() = default;
112 
ExtractLinks()113 void CPDF_LinkExtract::ExtractLinks() {
114   m_LinkArray.clear();
115   int start = 0;
116   int pos = 0;
117   bool bAfterHyphen = false;
118   bool bLineBreak = false;
119   const int nTotalChar = m_pTextPage->CountChars();
120   const WideString page_text = m_pTextPage->GetAllPageText();
121   while (pos < nTotalChar) {
122     const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
123     if (char_info.m_CharType != CPDF_TextPage::CharType::kGenerated &&
124         char_info.m_Unicode != L' ' && pos != nTotalChar - 1) {
125       bAfterHyphen =
126           (char_info.m_CharType == CPDF_TextPage::CharType::kHyphen ||
127            (char_info.m_CharType == CPDF_TextPage::CharType::kNormal &&
128             char_info.m_Unicode == L'-'));
129       ++pos;
130       continue;
131     }
132 
133     int nCount = pos - start;
134     if (pos == nTotalChar - 1) {
135       ++nCount;
136     } else if (bAfterHyphen &&
137                (char_info.m_Unicode == L'\n' || char_info.m_Unicode == L'\r')) {
138       // Handle text breaks with a hyphen to the next line.
139       bLineBreak = true;
140       ++pos;
141       continue;
142     }
143 
144     WideString strBeCheck = page_text.Substr(start, nCount);
145     if (bLineBreak) {
146       strBeCheck.Remove(L'\n');
147       strBeCheck.Remove(L'\r');
148       bLineBreak = false;
149     }
150     // Replace the generated code with the hyphen char.
151     strBeCheck.Replace(L"\xfffe", L"-");
152 
153     if (strBeCheck.GetLength() > 5) {
154       while (strBeCheck.GetLength() > 0) {
155         wchar_t ch = strBeCheck.Back();
156         if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.')
157           break;
158 
159         strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1);
160         nCount--;
161       }
162 
163       // Check for potential web URLs and email addresses.
164       // Ftp address, file system links, data, blob etc. are not checked.
165       if (nCount > 5) {
166         int32_t nStartOffset;
167         int32_t nCountOverload;
168         if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
169           m_LinkArray.push_back(
170               {start + nStartOffset, nCountOverload, strBeCheck});
171         } else if (CheckMailLink(&strBeCheck)) {
172           m_LinkArray.push_back({start, nCount, strBeCheck});
173         }
174       }
175     }
176     start = ++pos;
177   }
178 }
179 
CheckWebLink(WideString * strBeCheck,int32_t * nStart,int32_t * nCount)180 bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck,
181                                     int32_t* nStart,
182                                     int32_t* nCount) {
183   static const wchar_t kHttpScheme[] = L"http";
184   static const wchar_t kWWWAddrStart[] = L"www.";
185 
186   const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
187   const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
188 
189   WideString str = *strBeCheck;
190   str.MakeLower();
191 
192   size_t len = str.GetLength();
193   // First, try to find the scheme.
194   auto start = str.Find(kHttpScheme);
195   if (start.has_value()) {
196     size_t off = start.value() + kHttpSchemeLen;  // move after "http".
197     if (len > off + 4) {     // At least "://<char>" follows.
198       if (str[off] == L's')  // "https" scheme is accepted.
199         off++;
200       if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
201         off += 3;
202         size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
203                                                      str.GetLength() - 1);
204         end = FindWebLinkEnding(str, off, end);
205         if (end > off) {  // Non-empty host name.
206           *nStart = start.value();
207           *nCount = end - start.value() + 1;
208           *strBeCheck = strBeCheck->Substr(*nStart, *nCount);
209           return true;
210         }
211       }
212     }
213   }
214 
215   // When there is no scheme, try to find url starting with "www.".
216   start = str.Find(kWWWAddrStart);
217   if (start.has_value() && len > start.value() + kWWWAddrStartLen) {
218     size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
219                                                  str.GetLength() - 1);
220     end = FindWebLinkEnding(str, start.value(), end);
221     if (end > start.value() + kWWWAddrStartLen) {
222       *nStart = start.value();
223       *nCount = end - start.value() + 1;
224       *strBeCheck = L"http://" + strBeCheck->Substr(*nStart, *nCount);
225       return true;
226     }
227   }
228   return false;
229 }
230 
CheckMailLink(WideString * str)231 bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
232   auto aPos = str->Find(L'@');
233   // Invalid when no '@' or when starts/ends with '@'.
234   if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
235     return false;
236 
237   // Check the local part.
238   size_t pPos = aPos.value();  // Used to track the position of '@' or '.'.
239   for (size_t i = aPos.value(); i > 0; i--) {
240     wchar_t ch = (*str)[i - 1];
241     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
242       continue;
243 
244     if (ch != L'.' || i == pPos || i == 1) {
245       if (i == aPos.value()) {
246         // There is '.' or invalid char before '@'.
247         return false;
248       }
249       // End extracting for other invalid chars, '.' at the beginning, or
250       // consecutive '.'.
251       size_t removed_len = i == pPos ? i + 1 : i;
252       *str = str->Last(str->GetLength() - removed_len);
253       break;
254     }
255     // Found a valid '.'.
256     pPos = i - 1;
257   }
258 
259   // Check the domain name part.
260   aPos = str->Find(L'@');
261   if (!aPos.has_value() || aPos.value() == 0)
262     return false;
263 
264   str->TrimRight(L'.');
265   // At least one '.' in domain name, but not at the beginning.
266   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
267   // Check whether we should remove this check.
268   auto ePos = str->Find(L'.', aPos.value() + 1);
269   if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
270     return false;
271 
272   // Validate all other chars in domain name.
273   size_t nLen = str->GetLength();
274   pPos = 0;  // Used to track the position of '.'.
275   for (size_t i = aPos.value() + 1; i < nLen; i++) {
276     wchar_t wch = (*str)[i];
277     if (wch == L'-' || FXSYS_iswalnum(wch))
278       continue;
279 
280     if (wch != L'.' || i == pPos + 1) {
281       // Domain name should end before invalid char.
282       size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
283       if (pPos > 0 && host_end - aPos.value() >= 3) {
284         // Trim the ending invalid chars if there is at least one '.' and name.
285         *str = str->First(host_end + 1);
286         break;
287       }
288       return false;
289     }
290     pPos = i;
291   }
292 
293   if (!str->Contains(L"mailto:"))
294     *str = L"mailto:" + *str;
295 
296   return true;
297 }
298 
GetURL(size_t index) const299 WideString CPDF_LinkExtract::GetURL(size_t index) const {
300   return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl
301                                     : WideString();
302 }
303 
GetRects(size_t index) const304 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
305   if (index >= m_LinkArray.size())
306     return std::vector<CFX_FloatRect>();
307 
308   return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
309                                    m_LinkArray[index].m_Count);
310 }
311 
GetTextRange(size_t index,int * start_char_index,int * char_count) const312 bool CPDF_LinkExtract::GetTextRange(size_t index,
313                                     int* start_char_index,
314                                     int* char_count) const {
315   if (index >= m_LinkArray.size())
316     return false;
317   *start_char_index = m_LinkArray[index].m_Start;
318   *char_count = m_LinkArray[index].m_Count;
319   return true;
320 }
321