1 use super::*;
2 use crate::adapters::spawning::map_exe_error;
3 use crate::preproc::rga_preproc;
4 use lazy_static::lazy_static;
5 use log::*;
6 use std::fs::File;
7 use std::io::BufReader;
8 use std::path::PathBuf;
9 use std::process::Command;
10 
/// File extensions (written without a leading dot) that are turned into the
/// adapter's fast, name-based matchers.
static EXTENSIONS: &[&str] = &["pdf"];
12 
13 lazy_static! {
14     static ref METADATA: AdapterMeta = AdapterMeta {
15         name: "pdfpages".to_owned(),
16         version: 1,
17         description: "Converts a pdf to its individual pages as png files. Only useful in combination with tesseract".to_owned(),
18         recurses: true,
19         fast_matchers: EXTENSIONS
20             .iter()
21             .map(|s| FastMatcher::FileExtension(s.to_string()))
22             .collect(),
23         slow_matchers: Some(vec![SlowMatcher::MimeType(
24             "application/pdf".to_owned()
25         )])
26     };
27 }
/// Adapter that converts a pdf into one png image per page.
///
/// The type has no fields, so every instance is interchangeable.
#[derive(Default)]
pub struct PdfPagesAdapter {}

impl PdfPagesAdapter {
    /// Creates a new adapter instance.
    pub fn new() -> Self {
        Self {}
    }
}
36 
37 impl GetMetadata for PdfPagesAdapter {
metadata(&self) -> &AdapterMeta38     fn metadata(&self) -> &AdapterMeta {
39         &METADATA
40     }
41 }
42 
43 /// A pdf is basically converted to a zip that has Page X.png files.
44 /// This way, something like tesseract can process the pages individually
45 impl FileAdapter for PdfPagesAdapter {
adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()>46     fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> {
47         let AdaptInfo {
48             filepath_hint,
49             is_real_file,
50             oup,
51             line_prefix,
52             archive_recursion_depth,
53             config,
54             ..
55         } = ai;
56         if !is_real_file {
57             // todo: read to memory and then use that blob if size < max
58             writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?;
59             return Ok(());
60         }
61         let inp_fname = filepath_hint;
62         let exe_name = "gm";
63         let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?;
64         let out_fname = out_dir.path().join("out%04d.png");
65         debug!("writing to temp dir: {}", out_fname.display());
66         let mut cmd = Command::new(exe_name);
67         cmd.arg("convert")
68             .arg("-density")
69             .arg("200")
70             .arg(inp_fname)
71             .arg("+adjoin")
72             .arg(out_fname);
73 
74         let mut cmd = cmd
75             .spawn()
76             .map_err(|e| map_exe_error(e, exe_name, "Make sure you have imagemagick installed."))?;
77         let args = config.args;
78 
79         let status = cmd.wait()?;
80         if status.success() {
81         } else {
82             return Err(format_err!("subprocess failed: {:?}", status));
83         }
84         for (i, filename) in glob::glob(
85             out_dir
86                 .path()
87                 .join("out*.png")
88                 .to_str()
89                 .expect("temp path has invalid encoding"),
90         )?
91         .enumerate()
92         {
93             let mut ele = BufReader::new(File::open(filename?)?);
94             rga_preproc(AdaptInfo {
95                 filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)),
96                 is_real_file: false,
97                 inp: &mut ele,
98                 oup,
99                 line_prefix: &format!("{}Page {}:", line_prefix, i + 1),
100                 archive_recursion_depth: archive_recursion_depth + 1,
101                 config: PreprocConfig { cache: None, args },
102             })?;
103         }
104         Ok(())
105     }
106 }
107 
108 /*// todo: do this in an actually streaming fashion and less slow
// IEND chunk + PNG magic
110 // 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
111 let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a");
112 let split_seq_inx = 8;
113 fn split_by_seq<'a>(
114     split_seq: &'a [u8],
115     split_inx: usize,
116     read: &mut Read,
117 ) -> Fallible<impl IntoIterator<Item = impl Read> + 'a> {
118     let regex = split_seq
119         .iter()
120         .map(|c| format!("\\x{:0>2x}", c))
121         .collect::<Vec<_>>()
122         .join("");
123     let restr = format!("(?-u){}", regex);
124     eprintln!("re: {}", restr);
125     let re = regex::bytes::Regex::new(&restr)?;
126 
127     let mut all = Vec::new();
128     read.read_to_end(&mut all)?;
129     let mut out: Vec<Cursor<Vec<u8>>> = Vec::new();
130     let mut last = 0;
131     for (i, split) in re.find_iter(&all).enumerate() {
132         let pos = split.start() + split_inx;
133         out.push(Cursor::new(Vec::from(&all[last..pos])));
134         last = pos;
135     }
136     out.push(Cursor::new(Vec::from(&all[last..])));
137     Ok(out)
138 }*/
139