1 /* Copyright (C) 2005 J.F.Dockes
2  *   This program is free software; you can redistribute it and/or modify
3  *   it under the terms of the GNU General Public License as published by
4  *   the Free Software Foundation; either version 2 of the License, or
5  *   (at your option) any later version.
6  *
7  *   This program is distributed in the hope that it will be useful,
8  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  *   GNU General Public License for more details.
11  *
12  *   You should have received a copy of the GNU General Public License
13  *   along with this program; if not, write to the
14  *   Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16  */
17 #include "autoconfig.h"
18 
19 #include <stdio.h>
20 #include <sys/types.h>
21 #include "safesyswait.h"
22 #include <iostream>
23 #include <sstream>
24 
25 #include "cstr.h"
26 #include "mh_execm.h"
27 #include "mh_html.h"
28 #include "log.h"
29 #include "cancelcheck.h"
30 #include "smallut.h"
31 #include "md5ut.h"
32 #include "rclconfig.h"
33 #include "mimetype.h"
34 #include "idfile.h"
35 #include "rclutil.h"
36 #include "idxdiags.h"
37 
38 using namespace std;
39 
startCmd()40 bool MimeHandlerExecMultiple::startCmd()
41 {
42     LOGDEB("MimeHandlerExecMultiple::startCmd\n");
43     if (params.empty()) {
44         // Hu ho
45         LOGERR("MHExecMultiple::startCmd: empty params\n");
46         m_reason = "RECFILTERROR BADCONFIG";
47         return false;
48     }
49 
50     // Command name
51     string cmd = params.front();
52 
53     m_maxmemberkb = 50000;
54     m_config->getConfParam("membermaxkbs", &m_maxmemberkb);
55     ostringstream oss;
56     oss << "RECOLL_FILTER_MAXMEMBERKB=" << m_maxmemberkb;
57     m_cmd.putenv(oss.str());
58 
59     m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir());
60     m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
61                  "RECOLL_FILTER_FORPREVIEW=no");
62 
63     m_cmd.setrlimit_as(m_filtermaxmbytes);
64     m_adv.setmaxsecs(m_filtermaxseconds);
65     m_cmd.setAdvise(&m_adv);
66     std::string errfile;
67     m_config->getConfParam("helperlogfilename", errfile);
68     if (!errfile.empty()) {
69         m_cmd.setStderr(errfile);
70     }
71 
72     // Build parameter list: delete cmd name
73     vector<string>myparams(params.begin() + 1, params.end());
74 
75     if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) {
76         IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn);
77         m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd;
78         missingHelper = true;
79         whatHelper = cmd;
80         return false;
81     }
82     return true;
83 }
84 
85 // Note: data is not used if this is the "document:" field: it goes
86 // directly to m_metaData[cstr_dj_keycontent] to avoid an extra copy
87 //
88 // Messages are made of data elements. Each element is like:
89 // name: len\ndata
90 // An empty line signals the end of the message, so the whole thing
91 // would look like:
92 // Name1: Len1\nData1Name2: Len2\nData2\n
readDataElement(string & name,string & data)93 bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
94 {
95     string ibuf;
96 
97     // Read name and length
98     if (m_cmd.getline(ibuf) <= 0) {
99         LOGERR("MHExecMultiple: getline error\n");
100         return false;
101     }
102 
103     LOGDEB1("MHEM:rde: line [" << ibuf << "]\n");
104 
105     // Empty line (end of message) ?
106     if (!ibuf.compare("\n")) {
107         LOGDEB1("MHExecMultiple: Got empty line\n");
108         name.clear();
109         return true;
110     }
111 
112     // Filters will sometimes abort before entering the real protocol, ie if
113     // a module can't be loaded. Check the special filter error first word:
114     std::string::size_type pos;
115     if ((pos = ibuf.find("RECFILTERROR ")) == 0) {
116         m_reason = ibuf;
117         if (ibuf.find("HELPERNOTFOUND") != string::npos) {
118             IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn);
119             missingHelper = true;
120             whatHelper = ibuf.substr(pos);
121         }
122         return false;
123     }
124 
125     // We're expecting something like Name: len\n
126     vector<string> tokens;
127     stringToTokens(ibuf, tokens);
128     if (tokens.size() != 2) {
129         LOGERR("MHExecMultiple: bad line in filter output: [" << ibuf << "]\n");
130         return false;
131     }
132     vector<string>::iterator it = tokens.begin();
133     name = *it++;
134     string& slen = *it;
135     int len;
136     if (sscanf(slen.c_str(), "%d", &len) != 1) {
137         LOGERR("MHExecMultiple: bad line in filter output: [" << ibuf << "]\n");
138         return false;
139     }
140 
141     if (len / 1024 > m_maxmemberkb) {
142         LOGERR("MHExecMultiple: data len > maxmemberkb\n");
143         return false;
144     }
145 
146     // Hack: check for 'Document:' and read directly the document data
147     // to m_metaData[cstr_dj_keycontent] to avoid an extra copy of the bulky
148     // piece
149     string *datap = &data;
150     if (!stringlowercmp("document:", name)) {
151         datap = &m_metaData[cstr_dj_keycontent];
152     } else {
153         datap = &data;
154     }
155 
156     // Read element data
157     datap->erase();
158     if (len > 0 && m_cmd.receive(*datap, len) != len) {
159         LOGERR("MHExecMultiple: expected " << len << " bytes of data, got " <<
160                datap->length() << "\n");
161         return false;
162     }
163     LOGDEB1("MHExecMe:rdDtElt got: name [" << name << "] len " << len <<
164             "value [" << (datap->size() > 100 ?
165                           (datap->substr(0, 100) + " ...") : *datap) << endl);
166     return true;
167 }
168 
next_document()169 bool MimeHandlerExecMultiple::next_document()
170 {
171     LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n");
172     if (m_havedoc == false)
173         return false;
174 
175     if (missingHelper) {
176         LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
177         m_reason = whatHelper;
178         return false;
179     }
180 
181     if (m_cmd.getChildPid() <= 0 && !startCmd()) {
182         return false;
183     }
184 
185     m_metaData.clear();
186 
187     // Send request to child process. This maybe the first/only
188     // request for a given file, or a continuation request. We send an
189     // empty file name in the latter case.
190     // We also compute the file md5 before starting the extraction:
191     // under Windows, we may not be able to do it while the file
192     // is opened by the filter.
193     ostringstream obuf;
194     string file_md5;
195     if (m_filefirst) {
196         if (!m_forPreview && !m_nomd5) {
197             string md5, xmd5, reason;
198             if (MD5File(m_fn, md5, &reason)) {
199                 file_md5 = MD5HexPrint(md5, xmd5);
200             } else {
201                 LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
202                        "]: " << reason << "\n");
203             }
204         }
205         obuf << "filename: " << m_fn.length() << "\n" << m_fn;
206         // m_filefirst is set to true by set_document_file()
207         m_filefirst = false;
208     } else {
209         obuf << "filename: " << 0 << "\n";
210     }
211     if (!m_ipath.empty()) {
212         LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
213                m_ipath << "]\n");
214         obuf << "ipath: " << m_ipath.length() << "\n" << m_ipath;
215     }
216     if (!m_dfltInputCharset.empty()) {
217         obuf << "dflincs: " << m_dfltInputCharset.length() << "\n"
218              << m_dfltInputCharset;
219     }
220     obuf << "mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
221     obuf << "\n";
222     if (m_cmd.send(obuf.str()) < 0) {
223         m_cmd.zapChild();
224         LOGERR("MHExecMultiple: send error\n");
225         return false;
226     }
227 
228     m_adv.reset();
229 
230     // Read answer (multiple elements)
231     LOGDEB1("MHExecMultiple: reading answer\n");
232     bool eofnext_received = false;
233     bool eofnow_received = false;
234     bool fileerror_received = false;
235     bool subdocerror_received = false;
236     string ipath;
237     string mtype;
238     string charset;
239     for (int loop=0;;loop++) {
240         string name, data;
241         try {
242             if (!readDataElement(name, data)) {
243                 m_cmd.zapChild();
244                 return false;
245             }
246         } catch (HandlerTimeout) {
247             LOGINFO("MHExecMultiple: timeout\n");
248             m_cmd.zapChild();
249             return false;
250         } catch (CancelExcept) {
251             LOGINFO("MHExecMultiple: interrupt\n");
252             m_cmd.zapChild();
253             return false;
254         }
255         if (name.empty())
256             break;
257         if (!stringlowercmp("eofnext:", name)) {
258             LOGDEB("MHExecMultiple: got EOFNEXT\n");
259             eofnext_received = true;
260         } else if (!stringlowercmp("eofnow:", name)) {
261             LOGDEB("MHExecMultiple: got EOFNOW\n");
262             eofnow_received = true;
263         } else if (!stringlowercmp("fileerror:", name)) {
264             LOGDEB("MHExecMultiple: got FILEERROR\n");
265             fileerror_received = true;
266         } else if (!stringlowercmp("subdocerror:", name)) {
267             LOGDEB("MHExecMultiple: got SUBDOCERROR\n");
268             subdocerror_received = true;
269         } else if (!stringlowercmp("ipath:", name)) {
270             ipath = data;
271             LOGDEB("MHExecMultiple: got ipath [" << data << "]\n");
272         } else if (!stringlowercmp("charset:", name)) {
273             charset = data;
274             LOGDEB("MHExecMultiple: got charset [" << data << "]\n");
275         } else if (!stringlowercmp("mimetype:", name)) {
276             mtype = data;
277             LOGDEB("MHExecMultiple: got mimetype [" << data << "]\n");
278         } else {
279             string nm = stringtolower((const string&)name);
280             trimstring(nm, ":");
281             LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n");
282             addmeta(m_metaData, nm, data);
283         }
284         if (loop == 200) {
285             // ??
286             LOGERR("MHExecMultiple: handler sent more than 200 attributes\n");
287             return false;
288         }
289     }
290 
291     if (eofnow_received || fileerror_received) {
292         // No more docs
293         m_havedoc = false;
294         return false;
295     }
296     if (subdocerror_received) {
297         return false;
298     }
299 
300     // It used to be that eof could be signalled just by an empty document, but
301     // this was wrong. Empty documents can be found ie in zip files and should
302     // not be interpreted as eof.
303     if (m_metaData[cstr_dj_keycontent].empty()) {
304         LOGDEB0("MHExecMultiple: got empty document inside [" << m_fn <<
305                 "]: [" << ipath << "]\n");
306     }
307 
308     if (!ipath.empty()) {
309         // If this has an ipath, it is an internal doc from a
310         // multi-document file. In this case, either the filter
311         // supplies the mimetype, or the ipath MUST be a filename-like
312         // string which we can use to compute a mime type
313         m_metaData[cstr_dj_keyipath] = ipath;
314         if (mtype.empty()) {
315             LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
316                     "for a guess\n");
317             mtype = mimetype(ipath, 0, m_config, false);
318             if (mtype.empty()) {
319                 // mimetype() won't call idFile when there is no file. Do it
320                 mtype = idFileMem(m_metaData[cstr_dj_keycontent]);
321                 if (mtype.empty()) {
322                     // Note this happens for example for directory zip members
323                     // We could recognize them by the end /, but wouldn't know
324                     // what to do with them anyway.
325                     LOGINFO("MHExecMultiple: cant guess mime type\n");
326                     mtype = "application/octet-stream";
327                 }
328             }
329             /* If we identify text/plain from the suffix (as opposed
330                to the handler setting the type), we use text/plain1
331                instead. As directed in mimeconf, this will cause the
332                text handler to be applied (instead of internfile just
333                ending things there), allowing splitting and default
334                charset conversions. */
335             if (mtype == "text/plain") {
336                 mtype = "text/plain1";
337             }
338         }
339         m_metaData[cstr_dj_keymt] = mtype;
340         if (!m_forPreview) {
341             string md5, xmd5;
342             MD5String(m_metaData[cstr_dj_keycontent], md5);
343             m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
344         }
345     } else {
346         // "Self" document.
347         m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
348         m_metaData.erase(cstr_dj_keyipath);
349         if (!m_forPreview) {
350             m_metaData[cstr_dj_keymd5] = file_md5;
351         }
352     }
353 
354     handle_cs(m_metaData[cstr_dj_keymt], charset);
355 
356     if (eofnext_received)
357         m_havedoc = false;
358 
359     LOGDEB0("MHExecMultiple: returning " <<
360             m_metaData[cstr_dj_keycontent].size() <<
361             " bytes of content, mtype [" << m_metaData[cstr_dj_keymt] <<
362             "] charset [" << m_metaData[cstr_dj_keycharset] << "]\n");
363     LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString());
364     return true;
365 }
366