1 /* Copyright (C) 2005 J.F.Dockes
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the
14 * Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16 */
17 #include "autoconfig.h"
18
19 #include <stdio.h>
20 #include <sys/types.h>
21 #include "safesyswait.h"
22 #include <iostream>
23 #include <sstream>
24
25 #include "cstr.h"
26 #include "mh_execm.h"
27 #include "mh_html.h"
28 #include "log.h"
29 #include "cancelcheck.h"
30 #include "smallut.h"
31 #include "md5ut.h"
32 #include "rclconfig.h"
33 #include "mimetype.h"
34 #include "idfile.h"
35 #include "rclutil.h"
36 #include "idxdiags.h"
37
38 using namespace std;
39
startCmd()40 bool MimeHandlerExecMultiple::startCmd()
41 {
42 LOGDEB("MimeHandlerExecMultiple::startCmd\n");
43 if (params.empty()) {
44 // Hu ho
45 LOGERR("MHExecMultiple::startCmd: empty params\n");
46 m_reason = "RECFILTERROR BADCONFIG";
47 return false;
48 }
49
50 // Command name
51 string cmd = params.front();
52
53 m_maxmemberkb = 50000;
54 m_config->getConfParam("membermaxkbs", &m_maxmemberkb);
55 ostringstream oss;
56 oss << "RECOLL_FILTER_MAXMEMBERKB=" << m_maxmemberkb;
57 m_cmd.putenv(oss.str());
58
59 m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir());
60 m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
61 "RECOLL_FILTER_FORPREVIEW=no");
62
63 m_cmd.setrlimit_as(m_filtermaxmbytes);
64 m_adv.setmaxsecs(m_filtermaxseconds);
65 m_cmd.setAdvise(&m_adv);
66 std::string errfile;
67 m_config->getConfParam("helperlogfilename", errfile);
68 if (!errfile.empty()) {
69 m_cmd.setStderr(errfile);
70 }
71
72 // Build parameter list: delete cmd name
73 vector<string>myparams(params.begin() + 1, params.end());
74
75 if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) {
76 IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn);
77 m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd;
78 missingHelper = true;
79 whatHelper = cmd;
80 return false;
81 }
82 return true;
83 }
84
85 // Note: data is not used if this is the "document:" field: it goes
86 // directly to m_metaData[cstr_dj_keycontent] to avoid an extra copy
87 //
88 // Messages are made of data elements. Each element is like:
89 // name: len\ndata
90 // An empty line signals the end of the message, so the whole thing
91 // would look like:
92 // Name1: Len1\nData1Name2: Len2\nData2\n
readDataElement(string & name,string & data)93 bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
94 {
95 string ibuf;
96
97 // Read name and length
98 if (m_cmd.getline(ibuf) <= 0) {
99 LOGERR("MHExecMultiple: getline error\n");
100 return false;
101 }
102
103 LOGDEB1("MHEM:rde: line [" << ibuf << "]\n");
104
105 // Empty line (end of message) ?
106 if (!ibuf.compare("\n")) {
107 LOGDEB1("MHExecMultiple: Got empty line\n");
108 name.clear();
109 return true;
110 }
111
112 // Filters will sometimes abort before entering the real protocol, ie if
113 // a module can't be loaded. Check the special filter error first word:
114 std::string::size_type pos;
115 if ((pos = ibuf.find("RECFILTERROR ")) == 0) {
116 m_reason = ibuf;
117 if (ibuf.find("HELPERNOTFOUND") != string::npos) {
118 IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn);
119 missingHelper = true;
120 whatHelper = ibuf.substr(pos);
121 }
122 return false;
123 }
124
125 // We're expecting something like Name: len\n
126 vector<string> tokens;
127 stringToTokens(ibuf, tokens);
128 if (tokens.size() != 2) {
129 LOGERR("MHExecMultiple: bad line in filter output: [" << ibuf << "]\n");
130 return false;
131 }
132 vector<string>::iterator it = tokens.begin();
133 name = *it++;
134 string& slen = *it;
135 int len;
136 if (sscanf(slen.c_str(), "%d", &len) != 1) {
137 LOGERR("MHExecMultiple: bad line in filter output: [" << ibuf << "]\n");
138 return false;
139 }
140
141 if (len / 1024 > m_maxmemberkb) {
142 LOGERR("MHExecMultiple: data len > maxmemberkb\n");
143 return false;
144 }
145
146 // Hack: check for 'Document:' and read directly the document data
147 // to m_metaData[cstr_dj_keycontent] to avoid an extra copy of the bulky
148 // piece
149 string *datap = &data;
150 if (!stringlowercmp("document:", name)) {
151 datap = &m_metaData[cstr_dj_keycontent];
152 } else {
153 datap = &data;
154 }
155
156 // Read element data
157 datap->erase();
158 if (len > 0 && m_cmd.receive(*datap, len) != len) {
159 LOGERR("MHExecMultiple: expected " << len << " bytes of data, got " <<
160 datap->length() << "\n");
161 return false;
162 }
163 LOGDEB1("MHExecMe:rdDtElt got: name [" << name << "] len " << len <<
164 "value [" << (datap->size() > 100 ?
165 (datap->substr(0, 100) + " ...") : *datap) << endl);
166 return true;
167 }
168
next_document()169 bool MimeHandlerExecMultiple::next_document()
170 {
171 LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n");
172 if (m_havedoc == false)
173 return false;
174
175 if (missingHelper) {
176 LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
177 m_reason = whatHelper;
178 return false;
179 }
180
181 if (m_cmd.getChildPid() <= 0 && !startCmd()) {
182 return false;
183 }
184
185 m_metaData.clear();
186
187 // Send request to child process. This maybe the first/only
188 // request for a given file, or a continuation request. We send an
189 // empty file name in the latter case.
190 // We also compute the file md5 before starting the extraction:
191 // under Windows, we may not be able to do it while the file
192 // is opened by the filter.
193 ostringstream obuf;
194 string file_md5;
195 if (m_filefirst) {
196 if (!m_forPreview && !m_nomd5) {
197 string md5, xmd5, reason;
198 if (MD5File(m_fn, md5, &reason)) {
199 file_md5 = MD5HexPrint(md5, xmd5);
200 } else {
201 LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
202 "]: " << reason << "\n");
203 }
204 }
205 obuf << "filename: " << m_fn.length() << "\n" << m_fn;
206 // m_filefirst is set to true by set_document_file()
207 m_filefirst = false;
208 } else {
209 obuf << "filename: " << 0 << "\n";
210 }
211 if (!m_ipath.empty()) {
212 LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
213 m_ipath << "]\n");
214 obuf << "ipath: " << m_ipath.length() << "\n" << m_ipath;
215 }
216 if (!m_dfltInputCharset.empty()) {
217 obuf << "dflincs: " << m_dfltInputCharset.length() << "\n"
218 << m_dfltInputCharset;
219 }
220 obuf << "mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
221 obuf << "\n";
222 if (m_cmd.send(obuf.str()) < 0) {
223 m_cmd.zapChild();
224 LOGERR("MHExecMultiple: send error\n");
225 return false;
226 }
227
228 m_adv.reset();
229
230 // Read answer (multiple elements)
231 LOGDEB1("MHExecMultiple: reading answer\n");
232 bool eofnext_received = false;
233 bool eofnow_received = false;
234 bool fileerror_received = false;
235 bool subdocerror_received = false;
236 string ipath;
237 string mtype;
238 string charset;
239 for (int loop=0;;loop++) {
240 string name, data;
241 try {
242 if (!readDataElement(name, data)) {
243 m_cmd.zapChild();
244 return false;
245 }
246 } catch (HandlerTimeout) {
247 LOGINFO("MHExecMultiple: timeout\n");
248 m_cmd.zapChild();
249 return false;
250 } catch (CancelExcept) {
251 LOGINFO("MHExecMultiple: interrupt\n");
252 m_cmd.zapChild();
253 return false;
254 }
255 if (name.empty())
256 break;
257 if (!stringlowercmp("eofnext:", name)) {
258 LOGDEB("MHExecMultiple: got EOFNEXT\n");
259 eofnext_received = true;
260 } else if (!stringlowercmp("eofnow:", name)) {
261 LOGDEB("MHExecMultiple: got EOFNOW\n");
262 eofnow_received = true;
263 } else if (!stringlowercmp("fileerror:", name)) {
264 LOGDEB("MHExecMultiple: got FILEERROR\n");
265 fileerror_received = true;
266 } else if (!stringlowercmp("subdocerror:", name)) {
267 LOGDEB("MHExecMultiple: got SUBDOCERROR\n");
268 subdocerror_received = true;
269 } else if (!stringlowercmp("ipath:", name)) {
270 ipath = data;
271 LOGDEB("MHExecMultiple: got ipath [" << data << "]\n");
272 } else if (!stringlowercmp("charset:", name)) {
273 charset = data;
274 LOGDEB("MHExecMultiple: got charset [" << data << "]\n");
275 } else if (!stringlowercmp("mimetype:", name)) {
276 mtype = data;
277 LOGDEB("MHExecMultiple: got mimetype [" << data << "]\n");
278 } else {
279 string nm = stringtolower((const string&)name);
280 trimstring(nm, ":");
281 LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n");
282 addmeta(m_metaData, nm, data);
283 }
284 if (loop == 200) {
285 // ??
286 LOGERR("MHExecMultiple: handler sent more than 200 attributes\n");
287 return false;
288 }
289 }
290
291 if (eofnow_received || fileerror_received) {
292 // No more docs
293 m_havedoc = false;
294 return false;
295 }
296 if (subdocerror_received) {
297 return false;
298 }
299
300 // It used to be that eof could be signalled just by an empty document, but
301 // this was wrong. Empty documents can be found ie in zip files and should
302 // not be interpreted as eof.
303 if (m_metaData[cstr_dj_keycontent].empty()) {
304 LOGDEB0("MHExecMultiple: got empty document inside [" << m_fn <<
305 "]: [" << ipath << "]\n");
306 }
307
308 if (!ipath.empty()) {
309 // If this has an ipath, it is an internal doc from a
310 // multi-document file. In this case, either the filter
311 // supplies the mimetype, or the ipath MUST be a filename-like
312 // string which we can use to compute a mime type
313 m_metaData[cstr_dj_keyipath] = ipath;
314 if (mtype.empty()) {
315 LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
316 "for a guess\n");
317 mtype = mimetype(ipath, 0, m_config, false);
318 if (mtype.empty()) {
319 // mimetype() won't call idFile when there is no file. Do it
320 mtype = idFileMem(m_metaData[cstr_dj_keycontent]);
321 if (mtype.empty()) {
322 // Note this happens for example for directory zip members
323 // We could recognize them by the end /, but wouldn't know
324 // what to do with them anyway.
325 LOGINFO("MHExecMultiple: cant guess mime type\n");
326 mtype = "application/octet-stream";
327 }
328 }
329 /* If we identify text/plain from the suffix (as opposed
330 to the handler setting the type), we use text/plain1
331 instead. As directed in mimeconf, this will cause the
332 text handler to be applied (instead of internfile just
333 ending things there), allowing splitting and default
334 charset conversions. */
335 if (mtype == "text/plain") {
336 mtype = "text/plain1";
337 }
338 }
339 m_metaData[cstr_dj_keymt] = mtype;
340 if (!m_forPreview) {
341 string md5, xmd5;
342 MD5String(m_metaData[cstr_dj_keycontent], md5);
343 m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
344 }
345 } else {
346 // "Self" document.
347 m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
348 m_metaData.erase(cstr_dj_keyipath);
349 if (!m_forPreview) {
350 m_metaData[cstr_dj_keymd5] = file_md5;
351 }
352 }
353
354 handle_cs(m_metaData[cstr_dj_keymt], charset);
355
356 if (eofnext_received)
357 m_havedoc = false;
358
359 LOGDEB0("MHExecMultiple: returning " <<
360 m_metaData[cstr_dj_keycontent].size() <<
361 " bytes of content, mtype [" << m_metaData[cstr_dj_keymt] <<
362 "] charset [" << m_metaData[cstr_dj_keycharset] << "]\n");
363 LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString());
364 return true;
365 }
366