1 use super::*; 2 use crate::adapters::spawning::map_exe_error; 3 use crate::preproc::rga_preproc; 4 use lazy_static::lazy_static; 5 use log::*; 6 use std::fs::File; 7 use std::io::BufReader; 8 use std::path::PathBuf; 9 use std::process::Command; 10 11 static EXTENSIONS: &[&str] = &["pdf"]; 12 13 lazy_static! { 14 static ref METADATA: AdapterMeta = AdapterMeta { 15 name: "pdfpages".to_owned(), 16 version: 1, 17 description: "Converts a pdf to its individual pages as png files. Only useful in combination with tesseract".to_owned(), 18 recurses: true, 19 fast_matchers: EXTENSIONS 20 .iter() 21 .map(|s| FastMatcher::FileExtension(s.to_string())) 22 .collect(), 23 slow_matchers: Some(vec![SlowMatcher::MimeType( 24 "application/pdf".to_owned() 25 )]) 26 }; 27 } 28 #[derive(Default)] 29 pub struct PdfPagesAdapter {} 30 31 impl PdfPagesAdapter { new() -> PdfPagesAdapter32 pub fn new() -> PdfPagesAdapter { 33 PdfPagesAdapter {} 34 } 35 } 36 37 impl GetMetadata for PdfPagesAdapter { metadata(&self) -> &AdapterMeta38 fn metadata(&self) -> &AdapterMeta { 39 &METADATA 40 } 41 } 42 43 /// A pdf is basically converted to a zip that has Page X.png files. 44 /// This way, something like tesseract can process the pages individually 45 impl FileAdapter for PdfPagesAdapter { adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()>46 fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> { 47 let AdaptInfo { 48 filepath_hint, 49 is_real_file, 50 oup, 51 line_prefix, 52 archive_recursion_depth, 53 config, 54 .. 55 } = ai; 56 if !is_real_file { 57 // todo: read to memory and then use that blob if size < max 58 writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?; 59 return Ok(()); 60 } 61 let inp_fname = filepath_hint; 62 let exe_name = "gm"; 63 let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?; 64 let out_fname = out_dir.path().join("out%04d.png"); 65 debug!("writing to temp dir: {}", out_fname.display()); 66 let mut cmd = Command::new(exe_name); 67 cmd.arg("convert") 68 .arg("-density") 69 .arg("200") 70 .arg(inp_fname) 71 .arg("+adjoin") 72 .arg(out_fname); 73 74 let mut cmd = cmd 75 .spawn() 76 .map_err(|e| map_exe_error(e, exe_name, "Make sure you have imagemagick installed."))?; 77 let args = config.args; 78 79 let status = cmd.wait()?; 80 if status.success() { 81 } else { 82 return Err(format_err!("subprocess failed: {:?}", status)); 83 } 84 for (i, filename) in glob::glob( 85 out_dir 86 .path() 87 .join("out*.png") 88 .to_str() 89 .expect("temp path has invalid encoding"), 90 )? 91 .enumerate() 92 { 93 let mut ele = BufReader::new(File::open(filename?)?); 94 rga_preproc(AdaptInfo { 95 filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)), 96 is_real_file: false, 97 inp: &mut ele, 98 oup, 99 line_prefix: &format!("{}Page {}:", line_prefix, i + 1), 100 archive_recursion_depth: archive_recursion_depth + 1, 101 config: PreprocConfig { cache: None, args }, 102 })?; 103 } 104 Ok(()) 105 } 106 } 107 108 /*// todo: do this in an actually streaming fashion and less slow 109 // IEND chunk + PDF magic 110 // 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a 111 let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a"); 112 let split_seq_inx = 8; 113 fn split_by_seq<'a>( 114 split_seq: &'a [u8], 115 split_inx: usize, 116 read: &mut Read, 117 ) -> Fallible<impl IntoIterator<Item = impl Read> + 'a> { 118 let regex = split_seq 119 .iter() 120 .map(|c| format!("\\x{:0>2x}", c)) 121 .collect::<Vec<_>>() 122 .join(""); 123 let restr = format!("(?-u){}", regex); 124 eprintln!("re: {}", restr); 125 let re = regex::bytes::Regex::new(&restr)?; 126 127 let mut all = Vec::new(); 128 read.read_to_end(&mut all)?; 129 let mut out: Vec<Cursor<Vec<u8>>> = Vec::new(); 130 let mut last = 0; 131 for (i, split) in re.find_iter(&all).enumerate() { 132 let pos = split.start() + split_inx; 133 out.push(Cursor::new(Vec::from(&all[last..pos]))); 134 last = pos; 135 } 136 out.push(Cursor::new(Vec::from(&all[last..]))); 137 Ok(out) 138 }*/ 139