1 use super::encodings::{self, bytes_to_string, string_to_bytes};
2 use super::{Dictionary, Object, ObjectId};
3 use crate::xref::Xref;
4 use crate::{Error, Result};
5 use encoding::all::UTF_16BE;
6 use encoding::types::{DecoderTrap, EncoderTrap, Encoding};
7 use log::info;
8 use std::cmp::max;
9 use std::collections::BTreeMap;
10 use std::io::Write;
11 use std::str;
12 
/// PDF document.
///
/// In-memory representation of a PDF file: its version string, its trailer
/// dictionary, its cross-reference table and the indirect objects keyed by
/// object id.
#[derive(Debug, Clone)]
pub struct Document {
    /// The version of the PDF specification to which the file conforms.
    ///
    /// `Document::new` initialises this to `"1.4"`.
    pub version: String,

    /// The trailer gives the location of the cross-reference table and of certain special objects.
    ///
    /// Its `Root` entry is what `Document::catalog` follows to find the catalog.
    pub trailer: Dictionary,

    /// The cross-reference table contains locations of the indirect objects.
    pub reference_table: Xref,

    /// The objects that make up the document contained in the file.
    ///
    /// Keyed by object id; every lookup and dereference in this file goes
    /// through this map.
    pub objects: BTreeMap<ObjectId, Object>,

    /// Current maximum object id within the document.
    ///
    /// NOTE(review): nothing in this part of the file updates this field;
    /// presumably it is maintained by the code that adds objects — confirm.
    pub max_id: u32,
}
31 
32 impl Document {
33     /// Create new PDF document.
new() -> Document34     pub fn new() -> Document {
35         Document {
36             version: "1.4".to_string(),
37             trailer: Dictionary::new(),
38             reference_table: Xref::new(0),
39             objects: BTreeMap::new(),
40             max_id: 0,
41         }
42     }
43 
44     const DEREF_LIMIT: usize = 128;
45 
46     /// Follow references if the supplied object is a reference.
47     ///
48     /// Returns a tuple of an optional object id and final object.
49     /// The object id will be None if the object was not a
50     /// reference. Otherwise, it will be the last object id in the
51     /// reference chain.
dereference<'a>(&'a self, mut object: &'a Object) -> Result<(Option<ObjectId>, &'a Object)>52     pub fn dereference<'a>(&'a self, mut object: &'a Object) -> Result<(Option<ObjectId>, &'a Object)> {
53         let mut nb_deref = 0;
54         let mut id = None;
55 
56         while let Ok(ref_id) = object.as_reference() {
57             id = Some(ref_id);
58             object = self.objects.get(&ref_id).ok_or(Error::ObjectNotFound)?;
59 
60             nb_deref += 1;
61             if nb_deref > Self::DEREF_LIMIT {
62                 return Err(Error::ReferenceLimit);
63             }
64         }
65 
66         Ok((id, object))
67     }
68 
69     /// Get object by object id, will iteratively dereference a referenced object.
get_object(&self, id: ObjectId) -> Result<&Object>70     pub fn get_object(&self, id: ObjectId) -> Result<&Object> {
71         let object = self.objects.get(&id).ok_or(Error::ObjectNotFound)?;
72         self.dereference(object).map(|(_, object)| object)
73     }
74 
75     /// Get mutable reference to object by object id, will iteratively dereference a referenced object.
get_object_mut(&mut self, id: ObjectId) -> Result<&mut Object>76     pub fn get_object_mut(&mut self, id: ObjectId) -> Result<&mut Object> {
77         let object = self.objects.get(&id).ok_or(Error::ObjectNotFound)?;
78         let (ref_id, _) = self.dereference(object)?;
79 
80         Ok(self.objects.get_mut(&ref_id.unwrap_or(id)).unwrap())
81     }
82 
83     /// Get page object_id of the specified object object_id
get_object_page(&self, id: ObjectId) -> Result<ObjectId>84     pub fn get_object_page(&self, id: ObjectId) -> Result<ObjectId> {
85         for (_, object_id) in self.get_pages() {
86             let page = self.get_object(object_id)?.as_dict()?;
87             let annots = page.get(b"Annots")?.as_array()?;
88             let objects_ids = annots.iter().map(|object| object.as_reference()).collect::<Vec<_>>();
89 
90             if objects_ids.iter().any(|object_id| {
91                 if let Ok(object_id) = object_id {
92                     return id == *object_id;
93                 }
94 
95                 false
96             }) {
97                 return Ok(object_id);
98             }
99         }
100 
101         Err(Error::ObjectNotFound)
102     }
103 
104     /// Get dictionary object by id.
get_dictionary(&self, id: ObjectId) -> Result<&Dictionary>105     pub fn get_dictionary(&self, id: ObjectId) -> Result<&Dictionary> {
106         self.get_object(id).and_then(Object::as_dict)
107     }
108 
109     /// Traverse objects from trailer recursively, return all referenced object IDs.
traverse_objects<A: Fn(&mut Object) -> ()>(&mut self, action: A) -> Vec<ObjectId>110     pub fn traverse_objects<A: Fn(&mut Object) -> ()>(&mut self, action: A) -> Vec<ObjectId> {
111         fn traverse_array<A: Fn(&mut Object) -> ()>(array: &mut Vec<Object>, action: &A, refs: &mut Vec<ObjectId>) {
112             for item in array.iter_mut() {
113                 traverse_object(item, action, refs);
114             }
115         }
116         fn traverse_dictionary<A: Fn(&mut Object) -> ()>(dict: &mut Dictionary, action: &A, refs: &mut Vec<ObjectId>) {
117             for (_, v) in dict.iter_mut() {
118                 traverse_object(v, action, refs);
119             }
120         }
121         fn traverse_object<A: Fn(&mut Object) -> ()>(object: &mut Object, action: &A, refs: &mut Vec<ObjectId>) {
122             action(object);
123             match *object {
124                 Object::Array(ref mut array) => traverse_array(array, action, refs),
125                 Object::Dictionary(ref mut dict) => traverse_dictionary(dict, action, refs),
126                 Object::Stream(ref mut stream) => traverse_dictionary(&mut stream.dict, action, refs),
127                 Object::Reference(id) => {
128                     if !refs.contains(&id) {
129                         refs.push(id);
130                     }
131                 }
132                 _ => {}
133             }
134         }
135         let mut refs = vec![];
136         traverse_dictionary(&mut self.trailer, &action, &mut refs);
137         let mut index = 0;
138         while index < refs.len() {
139             if let Some(object) = self.objects.get_mut(&refs[index]) {
140                 traverse_object(object, &action, &mut refs);
141             }
142             index += 1;
143         }
144         refs
145     }
146 
147     /// Get catalog dictionary.
catalog(&self) -> Result<&Dictionary>148     pub fn catalog(&self) -> Result<&Dictionary> {
149         self.trailer
150             .get(b"Root")
151             .and_then(Object::as_reference)
152             .and_then(|id| self.get_dictionary(id))
153     }
154 
155     /// Get page numbers and corresponding object ids.
get_pages(&self) -> BTreeMap<u32, ObjectId>156     pub fn get_pages(&self) -> BTreeMap<u32, ObjectId> {
157         self.page_iter().enumerate().map(|(i, p)| ((i + 1) as u32, p)).collect()
158     }
159 
    /// Iterate over the object ids of the document's pages.
    ///
    /// The ids are produced by walking the page tree with `PageTreeIter`;
    /// `get_pages` numbers them starting at 1 in this same order.
    pub fn page_iter(&self) -> impl Iterator<Item = ObjectId> + '_ {
        PageTreeIter::new(self)
    }
163 
164     /// Get content stream object ids of a page.
get_page_contents(&self, page_id: ObjectId) -> Vec<ObjectId>165     pub fn get_page_contents(&self, page_id: ObjectId) -> Vec<ObjectId> {
166         let mut streams = vec![];
167         if let Ok(page) = self.get_dictionary(page_id) {
168             if let Ok(contents) = page.get(b"Contents") {
169                 match *contents {
170                     Object::Reference(ref id) => {
171                         streams.push(*id);
172                     }
173                     Object::Array(ref arr) => {
174                         for content in arr {
175                             if let Ok(id) = content.as_reference() {
176                                 streams.push(id)
177                             }
178                         }
179                     }
180                     _ => {}
181                 }
182             }
183         }
184         streams
185     }
186 
187     /// Get content of a page.
get_page_content(&self, page_id: ObjectId) -> Result<Vec<u8>>188     pub fn get_page_content(&self, page_id: ObjectId) -> Result<Vec<u8>> {
189         let mut content = Vec::new();
190         let content_streams = self.get_page_contents(page_id);
191         for object_id in content_streams {
192             if let Ok(content_stream) = self.get_object(object_id).and_then(Object::as_stream) {
193                 match content_stream.decompressed_content() {
194                     Ok(data) => content.write_all(&data)?,
195                     Err(_) => content.write_all(&content_stream.content)?,
196                 };
197             }
198         }
199         Ok(content)
200     }
201 
202     /// Get resources used by a page.
get_page_resources(&self, page_id: ObjectId) -> (Option<&Dictionary>, Vec<ObjectId>)203     pub fn get_page_resources(&self, page_id: ObjectId) -> (Option<&Dictionary>, Vec<ObjectId>) {
204         fn collect_resources(page_node: &Dictionary, resource_ids: &mut Vec<ObjectId>, doc: &Document) {
205             if let Ok(resources_id) = page_node.get(b"Resources").and_then(Object::as_reference) {
206                 resource_ids.push(resources_id);
207             }
208             if let Ok(page_tree) = page_node
209                 .get(b"Parent")
210                 .and_then(Object::as_reference)
211                 .and_then(|id| doc.get_dictionary(id))
212             {
213                 collect_resources(page_tree, resource_ids, doc);
214             }
215         };
216 
217         let mut resource_dict = None;
218         let mut resource_ids = Vec::new();
219         if let Ok(page) = self.get_dictionary(page_id) {
220             resource_dict = page.get(b"Resources").and_then(Object::as_dict).ok();
221             collect_resources(page, &mut resource_ids, self);
222         }
223         (resource_dict, resource_ids)
224     }
225 
226     /// Get fonts used by a page.
get_page_fonts(&self, page_id: ObjectId) -> BTreeMap<Vec<u8>, &Dictionary>227     pub fn get_page_fonts(&self, page_id: ObjectId) -> BTreeMap<Vec<u8>, &Dictionary> {
228         fn collect_fonts_from_resources<'a>(
229             resources: &'a Dictionary, fonts: &mut BTreeMap<Vec<u8>, &'a Dictionary>, doc: &'a Document,
230         ) {
231             if let Ok(font_dict) = resources.get(b"Font").and_then(Object::as_dict) {
232                 for (name, value) in font_dict.iter() {
233                     let font = match *value {
234                         Object::Reference(id) => doc.get_dictionary(id).ok(),
235                         Object::Dictionary(ref dict) => Some(dict),
236                         _ => None,
237                     };
238                     if !fonts.contains_key(name) {
239                         font.map(|font| fonts.insert(name.clone(), font));
240                     }
241                 }
242             }
243         };
244 
245         let mut fonts = BTreeMap::new();
246         let (resource_dict, resource_ids) = self.get_page_resources(page_id);
247         if let Some(resources) = resource_dict {
248             collect_fonts_from_resources(resources, &mut fonts, self);
249         }
250         for resource_id in resource_ids {
251             if let Ok(resources) = self.get_dictionary(resource_id) {
252                 collect_fonts_from_resources(resources, &mut fonts, self);
253             }
254         }
255         fonts
256     }
257 
decode_text(encoding: Option<&str>, bytes: &[u8]) -> String258     pub fn decode_text(encoding: Option<&str>, bytes: &[u8]) -> String {
259         if let Some(encoding) = encoding {
260             info!("{}", encoding);
261             match encoding {
262                 "StandardEncoding" => bytes_to_string(encodings::STANDARD_ENCODING, bytes),
263                 "MacRomanEncoding" => bytes_to_string(encodings::MAC_ROMAN_ENCODING, bytes),
264                 "MacExpertEncoding" => bytes_to_string(encodings::MAC_EXPERT_ENCODING, bytes),
265                 "WinAnsiEncoding" => bytes_to_string(encodings::WIN_ANSI_ENCODING, bytes),
266                 "UniGB-UCS2-H" | "UniGB−UTF16−H" => UTF_16BE.decode(bytes, DecoderTrap::Ignore).unwrap(),
267                 "Identity-H" => "?Identity-H Unimplemented?".to_string(), // Unimplemented
268                 _ => String::from_utf8_lossy(bytes).to_string(),
269             }
270         } else {
271             bytes_to_string(encodings::STANDARD_ENCODING, bytes)
272         }
273     }
274 
encode_text(encoding: Option<&str>, text: &str) -> Vec<u8>275     pub fn encode_text(encoding: Option<&str>, text: &str) -> Vec<u8> {
276         if let Some(encoding) = encoding {
277             match encoding {
278                 "StandardEncoding" => string_to_bytes(encodings::STANDARD_ENCODING, text),
279                 "MacRomanEncoding" => string_to_bytes(encodings::MAC_ROMAN_ENCODING, text),
280                 "MacExpertEncoding" => string_to_bytes(encodings::MAC_EXPERT_ENCODING, text),
281                 "WinAnsiEncoding" => string_to_bytes(encodings::WIN_ANSI_ENCODING, text),
282                 "UniGB-UCS2-H" | "UniGB−UTF16−H" => UTF_16BE.encode(text, EncoderTrap::Ignore).unwrap(),
283                 "Identity-H" => vec![], // Unimplemented
284                 _ => text.as_bytes().to_vec(),
285             }
286         } else {
287             string_to_bytes(encodings::STANDARD_ENCODING, text)
288         }
289     }
290 }
291 
/// The default document is the empty document produced by `Document::new`.
impl Default for Document {
    /// Delegates to `Document::new`.
    fn default() -> Self {
        Self::new()
    }
}
297 
/// Iterator over the object ids of the pages found by walking a document's page tree.
///
/// The walk starts at the `Pages` entry of the catalog and descends into
/// nested `Pages` nodes, yielding the id of every `Page` node it meets.
struct PageTreeIter<'a> {
    /// Document whose page tree is being walked.
    doc: &'a Document,
    /// Unvisited siblings left behind at each ancestor level; popped to resume
    /// a level once the current one is exhausted.
    stack: Vec<&'a [Object]>,
    /// Kids of the current node that have not been visited yet, or `None` when
    /// there is no current level.
    kids: Option<&'a [Object]>,
    /// Number of kids that may still be visited. Initialised to the number of
    /// objects in the document and decremented for every kid examined.
    iter_limit: usize,
}
304 
305 impl<'a> PageTreeIter<'a> {
306     const PAGE_TREE_DEPTH_LIMIT: usize = 256;
307 
new(doc: &'a Document) -> Self308     fn new(doc: &'a Document) -> Self {
309         if let Ok(page_tree_id) = doc
310             .catalog()
311             .and_then(|cat| cat.get(b"Pages"))
312             .and_then(Object::as_reference)
313         {
314             Self {
315                 doc,
316                 kids: Self::kids(doc, page_tree_id),
317                 stack: Vec::with_capacity(32),
318                 iter_limit: doc.objects.len(),
319             }
320         } else {
321             Self {
322                 doc,
323                 kids: None,
324                 stack: Vec::new(),
325                 iter_limit: doc.objects.len(),
326             }
327         }
328     }
329 
kids(doc: &Document, page_tree_id: ObjectId) -> Option<&[Object]>330     fn kids(doc: &Document, page_tree_id: ObjectId) -> Option<&[Object]> {
331         doc.get_dictionary(page_tree_id)
332             .and_then(|page_tree| page_tree.get(b"Kids"))
333             .and_then(Object::as_array)
334             .map(|k| k.as_slice())
335             .ok()
336     }
337 }
338 
339 impl Iterator for PageTreeIter<'_> {
340     type Item = ObjectId;
341 
    /// Advance the walk and return the id of the next `Page` node, or `None`
    /// when the tree is exhausted or the visit budget (`iter_limit`) is used up.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Consume the current level one kid at a time.
            while let Some((kid, new_kids)) = self.kids.and_then(|k| k.split_first()) {
                // Every examined kid costs one unit of budget; this bounds the
                // total work even if the tree contains cycles.
                if self.iter_limit == 0 {
                    return None;
                }
                self.iter_limit -= 1;

                // Remember the remaining siblings before looking at this kid.
                self.kids = Some(new_kids);

                // Kids that are not references, or that do not resolve to a
                // dictionary with a type name, are silently skipped.
                if let Ok(kid_id) = kid.as_reference() {
                    if let Ok(type_name) = self.doc.get_dictionary(kid_id).and_then(Dictionary::type_name) {
                        match type_name {
                            "Page" => {
                                return Some(kid_id);
                            }
                            "Pages" => {
                                // Descend only while the stack is below the depth
                                // limit; deeper nodes are skipped.
                                if self.stack.len() < Self::PAGE_TREE_DEPTH_LIMIT {
                                    // `self.kids` was set to `Some` just above.
                                    let kids = self.kids.unwrap();
                                    // Only levels with siblings left are worth resuming.
                                    if !kids.is_empty() {
                                        self.stack.push(kids);
                                    }
                                    self.kids = Self::kids(self.doc, kid_id);
                                }
                            }
                            _ => {}
                        }
                    }
                }
            }

            // Current level exhausted, try to pop.
            if let kids @ Some(_) = self.stack.pop() {
                self.kids = kids;
            } else {
                return None;
            }
        }
    }
381 
size_hint(&self) -> (usize, Option<usize>)382     fn size_hint(&self) -> (usize, Option<usize>) {
383         let kids = self.kids.unwrap_or(&[]);
384 
385         let nb_pages: usize = kids
386             .iter()
387             .chain(self.stack.iter().flat_map(|k| k.iter()))
388             .map(|kid| {
389                 if let Ok(dict) = kid.as_reference().and_then(|id| self.doc.get_dictionary(id)) {
390                     if let Ok("Pages") = dict.type_name() {
391                         let count = dict.get_deref(b"Count", self.doc).and_then(Object::as_i64).unwrap_or(0);
392                         // Don't let page count go backwards in case of an invalid document.
393                         max(0, count) as usize
394                     } else {
395                         1
396                     }
397                 } else {
398                     1
399                 }
400             })
401             .sum();
402 
403         (nb_pages, Some(nb_pages))
404     }
405 }
406 
// Marker impl: declares that once `next` has returned `None` it keeps returning `None`.
impl std::iter::FusedIterator for PageTreeIter<'_> {}
408