1 //
2 // ExternalTransport.cc
3 //
4 // ExternalTransport: Allows external programs to retrieve given URLs with
5 //                    unknown protocols.
6 //
7 // Part of the ht://Dig package   <http://www.htdig.org/>
8 // Copyright (c) 1995-2004 The ht://Dig Group
9 // For copyright details, see the file COPYING in your distribution
10 // or the GNU Library General Public License (LGPL) version 2 or later
11 // <http://www.gnu.org/copyleft/lgpl.html>
12 //
13 // $Id: ExternalTransport.cc,v 1.9 2004/05/28 13:15:14 lha Exp $
14 //
15 
16 #ifdef HAVE_CONFIG_H
17 #include "htconfig.h"
18 #endif /* HAVE_CONFIG_H */
19 
20 #include "ExternalTransport.h"
21 #include "htdig.h"
22 #include "QuotedStringList.h"
23 #include "URL.h"
24 #include "Dictionary.h"
25 #include "good_strtok.h"
26 
27 #include <ctype.h>
28 #include <stdio.h>
29 
30 #ifndef _MSC_VER /* _WIN32 */
31 #include <unistd.h>
32 #endif
33 
34 #include <stdlib.h>
35 #ifdef HAVE_WAIT_H
36 #include <wait.h>
37 #elif HAVE_SYS_WAIT_H
38 #include <sys/wait.h>
39 #endif
40 
41 #include "defaults.h"
42 
// Lookup tables built on the first call to ExternalTransport::canHandle()
// from the "external_protocols" configuration value:
//   handlers - protocol name -> String holding the handler command
//   toTypes  - protocol name -> String holding the text that followed "->"
//              in the configuration entry (empty when there was none)
43 static Dictionary	*handlers = 0;
44 static Dictionary	*toTypes = 0;
// Defined in another translation unit; Request() passes it to the handler
// as its last command-line argument.
45 extern String		configFile;
46 
47 //*****************************************************************************
48 // ExternalTransport::ExternalTransport(const String &protocol)
49 //
ExternalTransport(const String & protocol)50 ExternalTransport::ExternalTransport(const String &protocol)
51 {
52     if (canHandle(protocol))
53     {
54 	_Handler = ((String *)handlers->Find(protocol))->get();
55     }
56     ExternalTransport::_Protocol = protocol;
57     _Response = new ExternalTransport_Response;
58 }
59 
60 
61 //*****************************************************************************
62 // ExternalTransport::~ExternalTransport()
63 //
~ExternalTransport()64 ExternalTransport::~ExternalTransport()
65 {
66     if (_Response)
67     {
68 	delete _Response;
69     }
70 }
71 
72 
73 //*****************************************************************************
74 // int ExternalTransport::canHandle(const String &protocol)
75 //
76 int
canHandle(const String & protocol)77 ExternalTransport::canHandle(const String &protocol)
78 {
79 	HtConfiguration* config= HtConfiguration::config();
80     if (!handlers)
81     {
82 	handlers = new Dictionary();
83 	toTypes = new Dictionary();
84 
85 	QuotedStringList	qsl(config->Find("external_protocols"), " \t");
86 	String			from, to;
87 	int			i;
88 	int			sep;
89 
90 	for (i = 0; qsl[i]; i += 2)
91 	{
92 	    from = qsl[i];
93 	    to = "";
94 	    sep = from.indexOf("->");
95 	    if (sep != -1)
96 	    {
97 		to = from.sub(sep+2).get();
98 		from = from.sub(0, sep).get();
99 	    }
100 
101 	    // Recognise service specified as "https://" rather than "https"
102 	    sep = from.indexOf(":");
103 	    if (sep != -1)
104 		from = from.sub(0, sep).get();
105 
106 	    handlers->Add(from, new String(qsl[i + 1]));
107 	    toTypes->Add(from, new String(to));
108 	}
109     }
110     return handlers->Exists(protocol);
111 }
112 
113 
114 //*****************************************************************************
115 // void ExternalTransport::SetConnection(URL *u)
116 //
SetConnection(URL * u)117 void ExternalTransport::SetConnection (URL *u)
118 {
119     // Grab the actual URL to pass to the handler
120     _URL = *u;
121 
122     // OK, now call the parent method to make sure everything else is set up.
123     Transport::SetConnection (u->host(), u->port());
124 }
125 
126 
127 //*****************************************************************************
128 // DocStatus ExternalTransport::Request()
129 //
Request()130 Transport::DocStatus ExternalTransport::Request()
131 {
132 // NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32
133 #ifndef _MSC_VER /* _WIN32 */
134     //
135     // Start the external handler, passing the protocol, URL and config file
136     // as command arguments
137     //
138     StringList	hargs(_Handler);
139     char   **handlargs = new char * [hargs.Count() + 5];
140     int    argi;
141     for (argi = 0; argi < hargs.Count(); argi++)
142 	handlargs[argi] = (char *)hargs[argi];
143     handlargs[argi++] = _Protocol.get();
144     handlargs[argi++] = (char *)_URL.get().get();
145     handlargs[argi++] = configFile.get();
146     handlargs[argi++] = 0;
147 
148     int    stdout_pipe[2];
149     int	   fork_result = -1;
150     int	   fork_try;
151 
152     if (pipe(stdout_pipe) == -1)
153     {
154       if (debug)
155 	cerr << "External transport error: Can't create pipe!" << endl;
156       delete [] handlargs;
157       return GetDocumentStatus(_Response);
158     }
159 
160     for (fork_try = 4; --fork_try >= 0;)
161     {
162       fork_result = fork(); // Fork so we can execute in the child process
163       if (fork_result != -1)
164 	break;
165       if (fork_try)
166 	sleep(3);
167     }
168     if (fork_result == -1)
169     {
170       if (debug)
171 	cerr << "Fork Failure in ExternalTransport" << endl;
172       delete [] handlargs;
173       return GetDocumentStatus(_Response);
174     }
175 
176     if (fork_result == 0) // Child process
177     {
178 	close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe
179 	dup(stdout_pipe[1]);
180 	close(stdout_pipe[0]);
181 	close(stdout_pipe[1]);
182 	// not really necessary, and may pose Cygwin incompatibility...
183 	//close(STDIN_FILENO); // Close STDIN to replace with null dev.
184 	//open("/dev/null", O_RDONLY);
185 
186 	// Call External Transport Handler
187 	execv(handlargs[0], handlargs);
188 
189 	exit(EXIT_FAILURE);
190     }
191 
192     // Parent Process
193     delete [] handlargs;
194     close(stdout_pipe[1]); // Close STDOUT for writing
195     FILE *input = fdopen(stdout_pipe[0], "r");
196     if (input == NULL)
197     {
198       if (debug)
199 	cerr << "Fdopen Failure in ExternalTransport" << endl;
200       return GetDocumentStatus(_Response);
201     }
202 
203     // Set up a response for this request
204     _Response->Reset();
205     // We just accessed the document
206     _Response->_access_time = new HtDateTime();
207     _Response->_access_time->SettoNow();
208 
209 
210     // OK, now parse the stuff we got back from the handler...
211     String	line;
212     char	*token1;
213     int		in_header = 1;
214 
215     while (in_header && readLine(input, line))
216     {
217 	line.chop('\r');
218 	if (line.length() > 0 && debug > 2)
219 	    cout << "Header line: " << line << endl;
220 	token1 = strtok(line, "\t");
221 	if (token1 == NULL)
222 	  {
223 	    token1 = "";
224 	    in_header = 0;
225 	    break;
226 	  }
227 
228 	switch (*token1)
229 	 {
230 	    case 's':	// status code
231 		token1 = strtok(0, "\t");
232 		if (token1 != NULL)
233 		  _Response->_status_code = atoi(token1);
234 		else
235 		  cerr<< "External transport error: expected status code in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
236 		break;
237 
238 	    case 'r':	// status reason
239 		token1 = strtok(0, "\t");
240 		if (token1 != NULL)
241 		  _Response->_reason_phrase = token1;
242 		else
243 		  cerr<< "External transport error: expected status reason in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
244 		break;
245 
246 	    case 'm':	// modification time
247 		token1 = strtok(0, "\t");
248 		if (token1 != NULL)
249 		  _Response->_modification_time= NewDate(token1);  // Hopefully we can grok it...
250 		else
251 		  cerr<< "External transport error: expected modification time in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
252 		break;
253 
254 	    case 't':	// Content-Type
255 		token1 = strtok(0, "\t");
256 		if (token1 != NULL)
257 		  _Response->_content_type = token1;
258 		else
259 		  cerr<< "External transport error: expected content-type in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
260 		break;
261 
262 	    case 'l':	// Content-Length
263 		token1 = strtok(0, "\t");
264 		if (token1 != NULL)
265 		  _Response->_content_length = atoi(token1);
266 		else
267 		  cerr<< "External transport error: expected content-length in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
268 		break;
269 
270 	    case 'u':	// redirect target
271 		token1 = strtok(0, "\t");
272 		if (token1 != NULL)
273 		  _Response->_location = token1;
274 		else
275 		  cerr<< "External transport error: expected URL in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
276 		break;
277 
278 	    default:
279 		  cerr<< "External transport error: unknown field in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
280 		break;
281 	}
282     }
283 
284     // OK, now we read in the rest of the document as contents...
285     _Response->_contents = 0;
286     char        docBuffer[8192];
287     int         bytesRead;
288 
289     while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), input)) > 0)
290       {
291         if (debug > 2)
292 	  cout << "Read " << bytesRead << " from document\n";
293         if (_Response->_contents.length() + bytesRead > _max_document_size)
294             bytesRead = _max_document_size - _Response->_contents.length();
295         _Response->_contents.append(docBuffer, bytesRead);
296         if (_Response->_contents.length() >= _max_document_size)
297             break;
298       }
299     _Response->_document_length = _Response->_contents.length();
300     fclose(input);
301     // close(stdout_pipe[0]); // This is closed for us by the fclose()
302 
303     int rpid, status;
304     while ((rpid = wait(&status)) != fork_result && rpid != -1)
305 	;
306 
307 #endif
308 
309     return GetDocumentStatus(_Response);
310 }
311 
312 
313 //*****************************************************************************
314 // private
315 // DocStatus ExternalTransport::GetDocumentStatus(ExternalTransport_Response *r)
316 //
GetDocumentStatus(ExternalTransport_Response * r)317 Transport::DocStatus ExternalTransport::GetDocumentStatus(ExternalTransport_Response *r)
318 {
319    // The default is 'not found' if we can't figure it out...
320    DocStatus returnStatus = Document_not_found;
321    int statuscode = r->GetStatusCode();
322 
323    if (statuscode == 200)
324    {
325 	    returnStatus = Document_ok;   // OK
326    	    // Is it parsable?
327    }
328 
329    else if (statuscode > 200 && statuscode < 300)
330 	    returnStatus = Document_ok;      	   	 // Successful 2xx
331    else if (statuscode == 304)
332 	    returnStatus = Document_not_changed;   	 // Not modified
333    else if (statuscode > 300 && statuscode < 400)
334 	    returnStatus = Document_redirect;      	 // Redirection 3xx
335    else if (statuscode == 401)
336 	    returnStatus = Document_not_authorized;   // Unauthorized
337 
338    return returnStatus;
339 }
340 
341 
342 //*****************************************************************************
343 // private
344 // int ExternalTransport::readLine(FILE *in, String &line)
345 //
346 int
readLine(FILE * in,String & line)347 ExternalTransport::readLine(FILE *in, String &line)
348 {
349     char	buffer[2048];
350     int		length;
351 
352     line = 0;
353     while (fgets(buffer, sizeof(buffer), in))
354     {
355 	length = strlen(buffer);
356 	if (buffer[length - 1] == '\n')
357 	{
358 	    //
359 	    // A full line has been read.  Return it.
360 	    //
361 	    line << buffer;
362 	    line.chop('\n');
363 	    return 1;
364 	}
365 	else
366 	{
367 	    //
368 	    // Only a partial line was read.  Append it to the line
369 	    // and read some more.
370 	    //
371 	    line << buffer;
372 	}
373     }
374     return line.length() > 0;
375 }
376 
377