1 use super::encodings::{self, bytes_to_string, string_to_bytes}; 2 use super::{Dictionary, Object, ObjectId}; 3 use crate::xref::Xref; 4 use crate::{Error, Result}; 5 use encoding::all::UTF_16BE; 6 use encoding::types::{DecoderTrap, EncoderTrap, Encoding}; 7 use log::info; 8 use std::cmp::max; 9 use std::collections::BTreeMap; 10 use std::io::Write; 11 use std::str; 12 13 /// PDF document. 14 #[derive(Debug, Clone)] 15 pub struct Document { 16 /// The version of the PDF specification to which the file conforms. 17 pub version: String, 18 19 /// The trailer gives the location of the cross-reference table and of certain special objects. 20 pub trailer: Dictionary, 21 22 /// The cross-reference table contains locations of the indirect objects. 23 pub reference_table: Xref, 24 25 /// The objects that make up the document contained in the file. 26 pub objects: BTreeMap<ObjectId, Object>, 27 28 /// Current maximum object id within the document. 29 pub max_id: u32, 30 } 31 32 impl Document { 33 /// Create new PDF document. new() -> Document34 pub fn new() -> Document { 35 Document { 36 version: "1.4".to_string(), 37 trailer: Dictionary::new(), 38 reference_table: Xref::new(0), 39 objects: BTreeMap::new(), 40 max_id: 0, 41 } 42 } 43 44 const DEREF_LIMIT: usize = 128; 45 46 /// Follow references if the supplied object is a reference. 47 /// 48 /// Returns a tuple of an optional object id and final object. 49 /// The object id will be None if the object was not a 50 /// reference. Otherwise, it will be the last object id in the 51 /// reference chain. dereference<'a>(&'a self, mut object: &'a Object) -> Result<(Option<ObjectId>, &'a Object)>52 pub fn dereference<'a>(&'a self, mut object: &'a Object) -> Result<(Option<ObjectId>, &'a Object)> { 53 let mut nb_deref = 0; 54 let mut id = None; 55 56 while let Ok(ref_id) = object.as_reference() { 57 id = Some(ref_id); 58 object = self.objects.get(&ref_id).ok_or(Error::ObjectNotFound)?; 59 60 nb_deref += 1; 61 if nb_deref > Self::DEREF_LIMIT { 62 return Err(Error::ReferenceLimit); 63 } 64 } 65 66 Ok((id, object)) 67 } 68 69 /// Get object by object id, will iteratively dereference a referenced object. get_object(&self, id: ObjectId) -> Result<&Object>70 pub fn get_object(&self, id: ObjectId) -> Result<&Object> { 71 let object = self.objects.get(&id).ok_or(Error::ObjectNotFound)?; 72 self.dereference(object).map(|(_, object)| object) 73 } 74 75 /// Get mutable reference to object by object id, will iteratively dereference a referenced object. get_object_mut(&mut self, id: ObjectId) -> Result<&mut Object>76 pub fn get_object_mut(&mut self, id: ObjectId) -> Result<&mut Object> { 77 let object = self.objects.get(&id).ok_or(Error::ObjectNotFound)?; 78 let (ref_id, _) = self.dereference(object)?; 79 80 Ok(self.objects.get_mut(&ref_id.unwrap_or(id)).unwrap()) 81 } 82 83 /// Get page object_id of the specified object object_id get_object_page(&self, id: ObjectId) -> Result<ObjectId>84 pub fn get_object_page(&self, id: ObjectId) -> Result<ObjectId> { 85 for (_, object_id) in self.get_pages() { 86 let page = self.get_object(object_id)?.as_dict()?; 87 let annots = page.get(b"Annots")?.as_array()?; 88 let objects_ids = annots.iter().map(|object| object.as_reference()).collect::<Vec<_>>(); 89 90 if objects_ids.iter().any(|object_id| { 91 if let Ok(object_id) = object_id { 92 return id == *object_id; 93 } 94 95 false 96 }) { 97 return Ok(object_id); 98 } 99 } 100 101 Err(Error::ObjectNotFound) 102 } 103 104 /// Get dictionary object by id. get_dictionary(&self, id: ObjectId) -> Result<&Dictionary>105 pub fn get_dictionary(&self, id: ObjectId) -> Result<&Dictionary> { 106 self.get_object(id).and_then(Object::as_dict) 107 } 108 109 /// Traverse objects from trailer recursively, return all referenced object IDs. traverse_objects<A: Fn(&mut Object) -> ()>(&mut self, action: A) -> Vec<ObjectId>110 pub fn traverse_objects<A: Fn(&mut Object) -> ()>(&mut self, action: A) -> Vec<ObjectId> { 111 fn traverse_array<A: Fn(&mut Object) -> ()>(array: &mut Vec<Object>, action: &A, refs: &mut Vec<ObjectId>) { 112 for item in array.iter_mut() { 113 traverse_object(item, action, refs); 114 } 115 } 116 fn traverse_dictionary<A: Fn(&mut Object) -> ()>(dict: &mut Dictionary, action: &A, refs: &mut Vec<ObjectId>) { 117 for (_, v) in dict.iter_mut() { 118 traverse_object(v, action, refs); 119 } 120 } 121 fn traverse_object<A: Fn(&mut Object) -> ()>(object: &mut Object, action: &A, refs: &mut Vec<ObjectId>) { 122 action(object); 123 match *object { 124 Object::Array(ref mut array) => traverse_array(array, action, refs), 125 Object::Dictionary(ref mut dict) => traverse_dictionary(dict, action, refs), 126 Object::Stream(ref mut stream) => traverse_dictionary(&mut stream.dict, action, refs), 127 Object::Reference(id) => { 128 if !refs.contains(&id) { 129 refs.push(id); 130 } 131 } 132 _ => {} 133 } 134 } 135 let mut refs = vec![]; 136 traverse_dictionary(&mut self.trailer, &action, &mut refs); 137 let mut index = 0; 138 while index < refs.len() { 139 if let Some(object) = self.objects.get_mut(&refs[index]) { 140 traverse_object(object, &action, &mut refs); 141 } 142 index += 1; 143 } 144 refs 145 } 146 147 /// Get catalog dictionary. catalog(&self) -> Result<&Dictionary>148 pub fn catalog(&self) -> Result<&Dictionary> { 149 self.trailer 150 .get(b"Root") 151 .and_then(Object::as_reference) 152 .and_then(|id| self.get_dictionary(id)) 153 } 154 155 /// Get page numbers and corresponding object ids. get_pages(&self) -> BTreeMap<u32, ObjectId>156 pub fn get_pages(&self) -> BTreeMap<u32, ObjectId> { 157 self.page_iter().enumerate().map(|(i, p)| ((i + 1) as u32, p)).collect() 158 } 159 page_iter(&self) -> impl Iterator<Item = ObjectId> + '_160 pub fn page_iter(&self) -> impl Iterator<Item = ObjectId> + '_ { 161 PageTreeIter::new(self) 162 } 163 164 /// Get content stream object ids of a page. get_page_contents(&self, page_id: ObjectId) -> Vec<ObjectId>165 pub fn get_page_contents(&self, page_id: ObjectId) -> Vec<ObjectId> { 166 let mut streams = vec![]; 167 if let Ok(page) = self.get_dictionary(page_id) { 168 if let Ok(contents) = page.get(b"Contents") { 169 match *contents { 170 Object::Reference(ref id) => { 171 streams.push(*id); 172 } 173 Object::Array(ref arr) => { 174 for content in arr { 175 if let Ok(id) = content.as_reference() { 176 streams.push(id) 177 } 178 } 179 } 180 _ => {} 181 } 182 } 183 } 184 streams 185 } 186 187 /// Get content of a page. get_page_content(&self, page_id: ObjectId) -> Result<Vec<u8>>188 pub fn get_page_content(&self, page_id: ObjectId) -> Result<Vec<u8>> { 189 let mut content = Vec::new(); 190 let content_streams = self.get_page_contents(page_id); 191 for object_id in content_streams { 192 if let Ok(content_stream) = self.get_object(object_id).and_then(Object::as_stream) { 193 match content_stream.decompressed_content() { 194 Ok(data) => content.write_all(&data)?, 195 Err(_) => content.write_all(&content_stream.content)?, 196 }; 197 } 198 } 199 Ok(content) 200 } 201 202 /// Get resources used by a page. get_page_resources(&self, page_id: ObjectId) -> (Option<&Dictionary>, Vec<ObjectId>)203 pub fn get_page_resources(&self, page_id: ObjectId) -> (Option<&Dictionary>, Vec<ObjectId>) { 204 fn collect_resources(page_node: &Dictionary, resource_ids: &mut Vec<ObjectId>, doc: &Document) { 205 if let Ok(resources_id) = page_node.get(b"Resources").and_then(Object::as_reference) { 206 resource_ids.push(resources_id); 207 } 208 if let Ok(page_tree) = page_node 209 .get(b"Parent") 210 .and_then(Object::as_reference) 211 .and_then(|id| doc.get_dictionary(id)) 212 { 213 collect_resources(page_tree, resource_ids, doc); 214 } 215 }; 216 217 let mut resource_dict = None; 218 let mut resource_ids = Vec::new(); 219 if let Ok(page) = self.get_dictionary(page_id) { 220 resource_dict = page.get(b"Resources").and_then(Object::as_dict).ok(); 221 collect_resources(page, &mut resource_ids, self); 222 } 223 (resource_dict, resource_ids) 224 } 225 226 /// Get fonts used by a page. get_page_fonts(&self, page_id: ObjectId) -> BTreeMap<Vec<u8>, &Dictionary>227 pub fn get_page_fonts(&self, page_id: ObjectId) -> BTreeMap<Vec<u8>, &Dictionary> { 228 fn collect_fonts_from_resources<'a>( 229 resources: &'a Dictionary, fonts: &mut BTreeMap<Vec<u8>, &'a Dictionary>, doc: &'a Document, 230 ) { 231 if let Ok(font_dict) = resources.get(b"Font").and_then(Object::as_dict) { 232 for (name, value) in font_dict.iter() { 233 let font = match *value { 234 Object::Reference(id) => doc.get_dictionary(id).ok(), 235 Object::Dictionary(ref dict) => Some(dict), 236 _ => None, 237 }; 238 if !fonts.contains_key(name) { 239 font.map(|font| fonts.insert(name.clone(), font)); 240 } 241 } 242 } 243 }; 244 245 let mut fonts = BTreeMap::new(); 246 let (resource_dict, resource_ids) = self.get_page_resources(page_id); 247 if let Some(resources) = resource_dict { 248 collect_fonts_from_resources(resources, &mut fonts, self); 249 } 250 for resource_id in resource_ids { 251 if let Ok(resources) = self.get_dictionary(resource_id) { 252 collect_fonts_from_resources(resources, &mut fonts, self); 253 } 254 } 255 fonts 256 } 257 decode_text(encoding: Option<&str>, bytes: &[u8]) -> String258 pub fn decode_text(encoding: Option<&str>, bytes: &[u8]) -> String { 259 if let Some(encoding) = encoding { 260 info!("{}", encoding); 261 match encoding { 262 "StandardEncoding" => bytes_to_string(encodings::STANDARD_ENCODING, bytes), 263 "MacRomanEncoding" => bytes_to_string(encodings::MAC_ROMAN_ENCODING, bytes), 264 "MacExpertEncoding" => bytes_to_string(encodings::MAC_EXPERT_ENCODING, bytes), 265 "WinAnsiEncoding" => bytes_to_string(encodings::WIN_ANSI_ENCODING, bytes), 266 "UniGB-UCS2-H" | "UniGB−UTF16−H" => UTF_16BE.decode(bytes, DecoderTrap::Ignore).unwrap(), 267 "Identity-H" => "?Identity-H Unimplemented?".to_string(), // Unimplemented 268 _ => String::from_utf8_lossy(bytes).to_string(), 269 } 270 } else { 271 bytes_to_string(encodings::STANDARD_ENCODING, bytes) 272 } 273 } 274 encode_text(encoding: Option<&str>, text: &str) -> Vec<u8>275 pub fn encode_text(encoding: Option<&str>, text: &str) -> Vec<u8> { 276 if let Some(encoding) = encoding { 277 match encoding { 278 "StandardEncoding" => string_to_bytes(encodings::STANDARD_ENCODING, text), 279 "MacRomanEncoding" => string_to_bytes(encodings::MAC_ROMAN_ENCODING, text), 280 "MacExpertEncoding" => string_to_bytes(encodings::MAC_EXPERT_ENCODING, text), 281 "WinAnsiEncoding" => string_to_bytes(encodings::WIN_ANSI_ENCODING, text), 282 "UniGB-UCS2-H" | "UniGB−UTF16−H" => UTF_16BE.encode(text, EncoderTrap::Ignore).unwrap(), 283 "Identity-H" => vec![], // Unimplemented 284 _ => text.as_bytes().to_vec(), 285 } 286 } else { 287 string_to_bytes(encodings::STANDARD_ENCODING, text) 288 } 289 } 290 } 291 292 impl Default for Document { default() -> Self293 fn default() -> Self { 294 Self::new() 295 } 296 } 297 298 struct PageTreeIter<'a> { 299 doc: &'a Document, 300 stack: Vec<&'a [Object]>, 301 kids: Option<&'a [Object]>, 302 iter_limit: usize, 303 } 304 305 impl<'a> PageTreeIter<'a> { 306 const PAGE_TREE_DEPTH_LIMIT: usize = 256; 307 new(doc: &'a Document) -> Self308 fn new(doc: &'a Document) -> Self { 309 if let Ok(page_tree_id) = doc 310 .catalog() 311 .and_then(|cat| cat.get(b"Pages")) 312 .and_then(Object::as_reference) 313 { 314 Self { 315 doc, 316 kids: Self::kids(doc, page_tree_id), 317 stack: Vec::with_capacity(32), 318 iter_limit: doc.objects.len(), 319 } 320 } else { 321 Self { 322 doc, 323 kids: None, 324 stack: Vec::new(), 325 iter_limit: doc.objects.len(), 326 } 327 } 328 } 329 kids(doc: &Document, page_tree_id: ObjectId) -> Option<&[Object]>330 fn kids(doc: &Document, page_tree_id: ObjectId) -> Option<&[Object]> { 331 doc.get_dictionary(page_tree_id) 332 .and_then(|page_tree| page_tree.get(b"Kids")) 333 .and_then(Object::as_array) 334 .map(|k| k.as_slice()) 335 .ok() 336 } 337 } 338 339 impl Iterator for PageTreeIter<'_> { 340 type Item = ObjectId; 341 next(&mut self) -> Option<Self::Item>342 fn next(&mut self) -> Option<Self::Item> { 343 loop { 344 while let Some((kid, new_kids)) = self.kids.and_then(|k| k.split_first()) { 345 if self.iter_limit == 0 { 346 return None; 347 } 348 self.iter_limit -= 1; 349 350 self.kids = Some(new_kids); 351 352 if let Ok(kid_id) = kid.as_reference() { 353 if let Ok(type_name) = self.doc.get_dictionary(kid_id).and_then(Dictionary::type_name) { 354 match type_name { 355 "Page" => { 356 return Some(kid_id); 357 } 358 "Pages" => { 359 if self.stack.len() < Self::PAGE_TREE_DEPTH_LIMIT { 360 let kids = self.kids.unwrap(); 361 if !kids.is_empty() { 362 self.stack.push(kids); 363 } 364 self.kids = Self::kids(self.doc, kid_id); 365 } 366 } 367 _ => {} 368 } 369 } 370 } 371 } 372 373 // Current level exhausted, try to pop. 374 if let kids @ Some(_) = self.stack.pop() { 375 self.kids = kids; 376 } else { 377 return None; 378 } 379 } 380 } 381 size_hint(&self) -> (usize, Option<usize>)382 fn size_hint(&self) -> (usize, Option<usize>) { 383 let kids = self.kids.unwrap_or(&[]); 384 385 let nb_pages: usize = kids 386 .iter() 387 .chain(self.stack.iter().flat_map(|k| k.iter())) 388 .map(|kid| { 389 if let Ok(dict) = kid.as_reference().and_then(|id| self.doc.get_dictionary(id)) { 390 if let Ok("Pages") = dict.type_name() { 391 let count = dict.get_deref(b"Count", self.doc).and_then(Object::as_i64).unwrap_or(0); 392 // Don't let page count go backwards in case of an invalid document. 393 max(0, count) as usize 394 } else { 395 1 396 } 397 } else { 398 1 399 } 400 }) 401 .sum(); 402 403 (nb_pages, Some(nb_pages)) 404 } 405 } 406 407 impl std::iter::FusedIterator for PageTreeIter<'_> {} 408