1 /* ============================================================================
2  * Douglas Thrift's Search Engine License
3  *
4  * Copyright (C) 2002-2004, 2008, Douglas Thrift. All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  *    this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  *    this list of conditions and the following disclaimer in the documentation
13  *    and/or other materials provided with the distribution.
14  *
15  * 3. The end-user documentation included with the redistribution, if any, must
16  *    include the following acknowledgment:
17  *
18  *       "This product includes software developed by Douglas Thrift
19  *       (http://computers.douglasthrift.net/searchengine/)."
20  *
21  *    Alternately, this acknowledgment may appear in the software itself, if
22  *    and wherever such third-party acknowledgments normally appear.
23  *
24  * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25  *    be used to endorse or promote products derived from this software without
26  *    specific prior written permission.  For written permission, please visit
27  *    http://www.douglasthrift.net/contact.cgi for contact information.
28  *
29  * 5. Products derived from this software may not be called "Douglas Thrift's
30  *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31  *    name, without prior written permission.
32  *
33  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39  * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43  * ============================================================================
44  */
45 // Douglas Thrift's Search Engine HTTP Handler
46 //
47 // Douglas Thrift
48 //
49 // $Id: HttpHandler.cpp 372 2008-08-23 11:00:12Z douglas $
50 
51 #include "HttpHandler.hpp"
52 
53 // Lovely C Sockets!
54 #ifndef _WIN32
55 // BSD Sockets
56 #include <unistd.h>
57 #include <sys/types.h>
58 #include <sys/socket.h>
59 #include <netinet/in.h>
60 #include <netdb.h>
61 
closesocket(SOCKET s)62 inline int closesocket(SOCKET s) { return close(s); }
63 #endif
64 
65 #ifndef _OpenSSL_
HttpHandler()66 HttpHandler::HttpHandler() : binary(false), length(0), chunked(false)
67 #else
68 HttpHandler::HttpHandler() : binary(false), length(0), chunked(false),
69 	tls(false)
70 #endif
71 {
72 #ifdef _WIN32
73 	if (WSAStartup(MAKEWORD(2, 0), &data) != 0)
74 	{
75 		error(program + ": WSAStartup");
76 		exit(1);
77 	}
78 #endif // _WIN32
79 }
80 
~HttpHandler()81 HttpHandler::~HttpHandler()
82 {
83 #ifdef _WIN32
84 	WSACleanup();
85 #endif // _WIN32
86 }
87 
handle(URL & url,const string & referer,bool head)88 bool HttpHandler::handle(URL &url, const string& referer, bool head)
89 {
90 	bool answer(false);
91 
92 	if ((http = socket(PF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET)
93 	{
94 		error(program + ": Socket");
95 		exit(1);
96 	}
97 
98 	sockaddr_in address;
99 	hostent* host;
100 
101 	address.sin_family = AF_INET;
102 
103 	if ((host = gethostbyname(url.getAddress().c_str())) == NULL)
104 	{
105 		error(program + ": Host: " + url.getAddress(), true);
106 
107 		return answer;
108 	}
109 
110 	address.sin_addr = *((in_addr*)*host->h_addr_list);
111 	address.sin_port = htons(url.getPort());
112 
113 	if (connect(http, (sockaddr*)&address, sizeof(sockaddr_in)) ==
114 		SOCKET_ERROR)
115 	{
116 		error(program + ": Connect");
117 
118 		return answer;
119 	}
120 
121 #ifdef _OpenSSL_
122 	if (url.getTls())
123 	{
124 		tls = true;
125 
126 		if (!starttls()) return answer;
127 	}
128 #endif
129 
130 	if (head)
131 	{
132 		putline("HEAD " + url.getPath() + " HTTP/1.1");
133 	}
134 	else
135 	{
136 		putline("GET " + url.getPath() + " HTTP/1.1");
137 	}
138 
139 	putline("Accept: text/html; text/plain");
140 #ifndef _OpenSSL_
141 	putline("User-Agent: " + agent(true) + ' ' + platform());
142 
143 	if (url.getPort() == 80)
144 #else
145 	putline("User-Agent: " + agent(true) + ' ' + platform() + ' '
146 		+ openssl(true));
147 
148 	if (url.getPort() == 80 && tls || url.getPort() == 443 && tls)
149 #endif
150 	{
151 		putline("Host: " + url.getAddress());
152 	}
153 	else
154 	{
155 		ostringstream port;
156 
157 		port << url.getPort();
158 
159 		putline("Host: " + url.getAddress() + ':' + port.str());
160 	}
161 
162 	if (!referer.empty())
163 	{
164 		putline("Referer: " + referer);
165 	}
166 
167 	putline("Connection: close");
168 	putline();
169 
170 	Code response;
171 	string line;
172 
173 	do
174 	{
175 		line = getline();
176 
177 		if (line.find("HTTP/") != 0) return answer;
178 
179 		size_t dot(line.find('.')), space(line.find(' ')), major, minor;
180 		istringstream number(line.substr(5, dot - 5) + " " + line.substr(dot
181 			+ 1, space - dot - 1));
182 
183 		number >> major;
184 		number >> minor;
185 
186 		if (major > 1)
187 		{
188 			cerr << program << ": Potentially Incompatible Server: HTTP/" <<
189 				major << "." << minor << "\n";
190 
191 			return answer;
192 		}
193 
194 		number.clear();
195 		number.str(line.substr(space + 1, 3));
196 
197 		number >> response;
198 
199 		if (response < ok) do line = getline(); while (!line.empty());
200 	}
201 	while (response < ok);
202 
203 	do
204 	{
205 		line = getline();
206 
207 		if (!line.empty())
208 		{
209 			size_t colon(line.find(':'));
210 			string field(line.substr(0, colon)), value(line.substr(colon + 1));
211 
212 			while (isspace(value[0])) value.erase(0, 1);
213 
214 			if (field == "Content-Type")
215 			{
216 				type = value;
217 			}
218 			else if (field == "Content-Length")
219 			{
220 				istringstream number(value);
221 
222 				number >> length;
223 			}
224 			else if (field == "Location")
225 			{
226 				location = value;
227 			}
228 			else if (field == "Transfer-Encoding")
229 			{
230 				chunked = value == "chunked";
231 			}
232 		}
233 	}
234 	while (!line.empty());
235 
236 	switch (response)
237 	{
238 	case ok:
239 		if (debug) cerr << "response = " << response << "\n";
240 
241 		answer = true;
242 		break;
243 	case choices:
244 	case moved:
245 	case found:
246 		if (debug) cerr << "response = " << response << "\n"
247 			<< "location = " << location << "\n";
248 
249 		location = getLink(location, url);
250 		break;
251 	case notfound:
252 	case internal:
253 		if (debug) cerr << "response = " << response << "\n";
254 		break;
255 	default:
256 		if (debug) cerr << "response = " << response << "\n";
257 
258 		if (response <= 299) answer = true; else if (response <= 399)
259 		{
260 			location = getLink(location, url);
261 		}
262 		break;
263 	}
264 
265 	if (!head && answer) populate();
266 
267 	return answer;
268 }
269 
clear()270 void HttpHandler::clear()
271 {
272 #ifdef _OpenSSL_
273 	if (tls)
274 	{
275 		SSL_shutdown(ssl);
276 		SSL_free(ssl);
277 		SSL_CTX_free(ctx);
278 	}
279 #endif
280 
281 	closesocket(http);
282 
283 	length = 0;
284 
285 	type.erase();
286 	location.erase();
287 	page.clear();
288 	page.str("");
289 
290 	chunked = false;
291 #ifdef _OpenSSL_
292 	tls = false;
293 #endif
294 }
295 
populate()296 void HttpHandler::populate()
297 {
298 	if (!chunked)
299 	{
300 		size_t left(length);
301 
302 		while (left > 0)
303 		{
304 			memset(buffer, 0, BUFSIZ + 1);
305 
306 			size_t bytes(left > BUFSIZ ? BUFSIZ : left);
307 			long received;
308 
309 			while (true)
310 			{
311 #ifndef _OpenSSL_
312 				if ((received = recv(http, buffer, bytes, 0)) == SOCKET_ERROR)
313 				{
314 					error(program + ": Recv");
315 					exit(1);
316 				}
317 #else
318 				if ((received = !tls ? recv(http, buffer, bytes, 0) :
319 					SSL_read(ssl, buffer, bytes)) <= 0)
320 				{
321 					!tls ? error(program + ": Recv") : error(program +
322 						": SSL Read", int(received));
323 				}
324 #endif
325 				else if (received != bytes)
326 				{
327 					left -= received;
328 
329 					page << buffer;
330 
331 					memset(buffer, 0, BUFSIZ + 1);
332 
333 					bytes -= received;
334 				}
335 				else break;
336 			}
337 
338 			page << buffer;
339 
340 			left -= bytes;
341 		}
342 	}
343 	else
344 	{
345 		size_t chunk;
346 
347 		do
348 		{
349 			istringstream number(getline());
350 
351 			number.setf(ios_base::hex, ios_base::basefield);
352 
353 			number >> chunk;
354 
355 			size_t left(chunk);
356 
357 			while (left > 0)
358 			{
359 				memset(buffer, 0, BUFSIZ + 1);
360 
361 				size_t bytes(left > BUFSIZ ? BUFSIZ : left);
362 				long received;
363 
364 				while (true)
365 				{
366 #ifndef _OpenSSL_
367 					if ((received = recv(http, buffer, bytes, 0)) ==
368 						SOCKET_ERROR)
369 					{
370 						error(program + ": Recv");
371 						exit(1);
372 					}
373 #else
374 					if ((received = !tls ? recv(http, buffer, bytes, 0) :
375 						SSL_read(ssl, buffer, bytes)) <= 0)
376 					{
377 						!tls ? error(program + ": Recv") : error(program +
378 							": SSL Read", int(received));
379 
380 						exit(1);
381 					}
382 #endif
383 					else if (received != bytes)
384 					{
385 						left -= received;
386 						page << buffer;
387 
388 						memset(buffer, 0, BUFSIZ + 1);
389 
390 						bytes -= received;
391 					}
392 					else break;
393 				}
394 
395 				page << buffer;
396 
397 				left -= bytes;
398 			}
399 
400 			getline();
401 
402 			length += chunk;
403 		}
404 		while (chunk > 0);
405 	}
406 
407 	if (!binary)
408 	{
409 		string page(this->page.str());
410 
411 		for (size_t index(0); index < page.length(); index++)
412 		{
413 			if (page[index] == '\r' && (index + 1 < page.length()) ?
414 				page[index + 1] == '\n' : false)
415 			{
416 				page.erase(index, 1);
417 			}
418 			else if (page[index] == '\r')
419 			{
420 				page[index] = '\n';
421 			}
422 		}
423 
424 		this->page.str(page);
425 	}
426 }
427 
putline(const string & line)428 void HttpHandler::putline(const string& line)
429 {
430 	snprintf(buffer, BUFSIZ + 1, "%s\r\n", line.c_str());
431 
432 #ifndef _OpenSSL_
433 	if (send(http, buffer, strlen(buffer), 0) == SOCKET_ERROR)
434 	{
435 		error(program + ": Send");
436 		exit(1);
437 	}
438 #else
439 	if (!tls)
440 	{
441 		if (send(http, buffer, strlen(buffer), 0) == SOCKET_ERROR)
442 		{
443 			error(program + ": Send");
444 			exit(1);
445 		}
446 	}
447 	else
448 	{
449 		int number;
450 
451 		if ((number = SSL_write(ssl, buffer, strlen(buffer))) <= 0)
452 		{
453 			error(program + ": SSL Write", number);
454 			exit(1);
455 		}
456 	}
457 #endif
458 }
459 
getline()460 string HttpHandler::getline()
461 {
462 	string line;
463 	char byte;
464 
465 	do
466 	{
467 #ifndef _OpenSSL_
468 		if (recv(http, &byte, 1, 0) == SOCKET_ERROR)
469 		{
470 			error(program + ": Recv");
471 		}
472 #else
473 		if (!tls)
474 		{
475 			if (recv(http, &byte, 1, 0) == SOCKET_ERROR)
476 			{
477 				error(program + ": Recv");
478 			}
479 		}
480 		else
481 		{
482 			int number;
483 
484 			if ((number = SSL_read(ssl, &byte, 1)) <= 0)
485 			{
486 				error(program + ": SSL Read", number);
487 			}
488 		}
489 #endif
490 
491 		if (byte != '\r' && byte != '\n')
492 		{
493 			line += byte;
494 		}
495 	}
496 	while (byte != '\n');
497 
498 	return line;
499 }
500 
error(const string & prefix,bool host)501 void HttpHandler::error(const string& prefix, bool host)
502 {
503 #ifdef _WIN32
504 	string error;
505 
506 	switch (WSAGetLastError())
507 	{
508 	case WSAEACCES:
509 		error = "Permission denied";
510 		break;
511 	case WSAEADDRINUSE:
512 		error = "Address already in use";
513 		break;
514 	case WSAEADDRNOTAVAIL:
515 		error = "Cannot assign requested address";
516 		break;
517 	case WSAEAFNOSUPPORT:
518 		error = "Address family not supported by protocol family";
519 		break;
520 	case WSAEALREADY:
521 		error = "Operation already in progress";
522 		break;
523 	case WSAECONNABORTED:
524 		error = "Software caused connection abort";
525 		break;
526 	case WSAECONNREFUSED:
527 		error = "Connection refused";
528 		break;
529 	case WSAECONNRESET:
530 		error = "Connection reset by peer";
531 		break;
532 	case WSAEDESTADDRREQ:
533 		error = "Destination address required";
534 		break;
535 	case WSAEFAULT:
536 		error = "Bad address";
537 		break;
538 	case WSAEHOSTDOWN:
539 		error = "Host is down";
540 		break;
541 	case WSAEHOSTUNREACH:
542 		error = "No route to host";
543 		break;
544 	case WSAEINPROGRESS:
545 		error = "Operation now in progress";
546 		break;
547 	case WSAEINTR:
548 		error = "Interrupted function call";
549 		break;
550 	case WSAEINVAL:
551 		error = "Invalid argument";
552 		break;
553 	case WSAEISCONN:
554 		error = "Socket is already connected";
555 		break;
556 	case WSAEMFILE:
557 		error = "Too many open files";
558 		break;
559 	case WSAEMSGSIZE:
560 		error = "Message too long";
561 		break;
562 	case WSAENETDOWN:
563 		error = "Network is down";
564 		break;
565 	case WSAENETRESET:
566 		error = "Network dropped connection on reset";
567 		break;
568 	case WSAENETUNREACH:
569 		error = "Network is unreachable";
570 		break;
571 	case WSAENOBUFS:
572 		error = "No buffer space available";
573 		break;
574 	case WSAENOPROTOOPT:
575 		error = "Bad protocol option";
576 		break;
577 	case WSAENOTCONN:
578 		error = "Socket is not connected";
579 		break;
580 	case WSAENOTSOCK:
581 		error = "Socket operation on non-socket";
582 		break;
583 	case WSAEOPNOTSUPP:
584 		error = "Operation not supported";
585 		break;
586 	case WSAEPFNOSUPPORT:
587 		error = "Protocol family not supported";
588 		break;
589 	case WSAEPROCLIM:
590 		error = "Too many processes";
591 		break;
592 	case WSAEPROTONOSUPPORT:
593 		error = "Protocol not supported";
594 		break;
595 	case WSAEPROTOTYPE:
596 		error = "Protocol wrong type for socket";
597 		break;
598 	case WSAESHUTDOWN:
599 		error = "Cannot send after socket shutdown";
600 		break;
601 	case WSAESOCKTNOSUPPORT:
602 		error = "Socket type not supported";
603 		break;
604 	case WSAETIMEDOUT:
605 		error = "Connection timed out";
606 		break;
607 	case WSATYPE_NOT_FOUND:
608 		error = "Class type not found";
609 		break;
610 	case WSAEWOULDBLOCK:
611 		error = "Resource temporarily unavailable";
612 		break;
613 	case WSAHOST_NOT_FOUND:
614 		error = "Host not found";
615 		break;
616 	case WSA_INVALID_HANDLE:
617 		error = "Specified event object handle is invalid";
618 		break;
619 	case WSA_INVALID_PARAMETER:
620 		error = "One or more parameters are invalid";
621 		break;
622 //	case WSAINVALIDPROCTABLE:
623 //		error = "Invalid procedure table from service provider";
624 //		break;
625 //	case WSAINVALIDPROVIDER:
626 //		error = "Invalid service provider version number";
627 //		break;
628 	case WSA_IO_INCOMPLETE:
629 		error = "Overlapped I/O event object not in signaled state";
630 		break;
631 	case WSA_IO_PENDING:
632 		error = "Overlapped operations will complete later";
633 		break;
634 	case WSA_NOT_ENOUGH_MEMORY:
635 		error = "Insufficient memory available";
636 		break;
637 	case WSANOTINITIALISED:
638 		error = "Successful WSAStartup not yet performed";
639 		break;
640 	case WSANO_DATA:
641 		error = "Valid name, no data record of requested type";
642 		break;
643 	case WSANO_RECOVERY:
644 		error = "This is a non-recoverable error";
645 		break;
646 //	case WSAPROVIDERFAILEDINIT:
647 //		error = "Unable to initialize a service provider";
648 //		break;
649 	case WSASYSCALLFAILURE:
650 		error = "System call failure";
651 		break;
652 	case WSASYSNOTREADY:
653 		error = "Network subsystem is unavailable";
654 		break;
655 	case WSATRY_AGAIN:
656 		error = "Non-authoritative host not found";
657 		break;
658 	case WSAVERNOTSUPPORTED:
659 		error = "WINSOCK.DLL version out of range";
660 		break;
661 	case WSAEDISCON:
662 		error = "Graceful shutdown in progress";
663 		break;
664 	case WSA_OPERATION_ABORTED:
665 		error = "Overlapped operation aborted";
666 		break;
667 	default:
668 		error = "Unknown error";
669 		break;
670 	}
671 
672 	cerr << prefix << ": " << error << "\n";
673 #else
674 	if (host)
675 	{
676 		string error;
677 
678 		switch (h_errno)
679 		{
680 		case HOST_NOT_FOUND:
681 			error = "Unknown host";
682 			break;
683 		case TRY_AGAIN:
684 			error = "Host name lookup failure";
685 			break;
686 		case NO_RECOVERY:
687 			error = "Unknown server error";
688 			break;
689 		case NO_DATA:
690 			error = "No address associated with name";
691 			break;
692 		default:
693 			error = "Unknown error";
694 			break;
695 		}
696 
697 		cerr << prefix << ": " << error << "\n";
698 	}
699 	else
700 	{
701 		perror(prefix.c_str());
702 	}
703 #endif // _WIN32
704 }
705 
706 #ifdef _OpenSSL_
error(const string & prefix,int number)707 void HttpHandler::error(const string& prefix, int number)
708 {
709 	string error;
710 
711 	switch (SSL_get_error(ssl, number))
712 	{
713 	case SSL_ERROR_NONE:
714 		error = "The TLS/SSL I/O operation completed";
715 		break;
716 	case SSL_ERROR_ZERO_RETURN:
717 		error = "The TLS/SSL connection has been closed";
718 		break;
719 	case SSL_ERROR_WANT_READ:
720 	case SSL_ERROR_WANT_WRITE:
721 	case SSL_ERROR_WANT_CONNECT:
722 //	case SSL_ERROR_WANT_ACCEPT:
723 	case SSL_ERROR_WANT_X509_LOOKUP:
724 		error = "The operation did not complete";
725 		break;
726 	case SSL_ERROR_SYSCALL:
727 		if (int err = ERR_get_error() != 0)
728 		{
729 			error = ERR_reason_error_string(err);
730 		}
731 		else
732 		{
733 			switch (number)
734 			{
735 			case 0:
736 				error = "An EOF was observed that violates the protocol";
737 				break;
738 			case -1:
739 				this->error(prefix);
740 				return;
741 			default:
742 				error = "Unknown error";
743 				break;
744 			}
745 		}
746 		break;
747 	case SSL_ERROR_SSL:
748 		error = ERR_reason_error_string(ERR_get_error());
749 		break;
750 	default:
751 		error = "Unknown error";
752 		break;
753 	}
754 
755 	cerr << prefix << ": " << error << "\n";
756 }
757 
starttls()758 bool HttpHandler::starttls()
759 {
760 	SSL_load_error_strings();
761 	SSL_library_init();
762 
763 #ifndef _urandomdev_
764 	int pid(getpid()), now(time(NULL));
765 	size_t seed(now > pid ? now - pid : pid - now);
766 	char* junk = new char[seed % 30 + 2];
767 
768 	junk[0] = pid;
769 	junk[seed % 30 + 1] = now;
770 
771 	srand(seed);
772 
773 	for (size_t index = 1; index < seed % 30 + 1; index++)
774 	{
775 		junk[index] = rand();
776 	}
777 
778 	if (debug)
779 	{
780 		cerr << "junk = {\n";
781 
782 		for (size_t index = 1; index < seed % 30 + 2; index++)
783 		{
784 			cerr << "   [" << index << "] = " << size_t(junk[index]) << "\n";
785 		}
786 
787 		cerr << "}\n";
788 	}
789 
790 	RAND_seed(junk, seed % 30 + 2);
791 
792 	delete junk;
793 #else
794 	if (debug) cerr << "junk = /dev/urandom\n";
795 #endif
796 
797 	ctx = SSL_CTX_new(TLSv1_client_method());
798 
799 	if (ctx == NULL)
800 	{
801 		cerr << program << ": SSL CTX New: "
802 			<< ERR_reason_error_string(ERR_get_error()) << "\n";
803 
804 		return false;
805 	}
806 
807 	ssl = SSL_new(ctx);
808 
809 	if (SSL_set_fd(ssl, http) == 0)
810 	{
811 		cerr << program << ": SSL Set FD: "
812 			<< ERR_reason_error_string(ERR_get_error()) << "\n";
813 
814 		return false;
815 	}
816 
817 	int number;
818 
819 	if ((number = SSL_connect(ssl)) <= 0)
820 	{
821 		error(program + ": SSL Connect", number);
822 
823 		return false;
824 	}
825 
826 	return true;
827 }
828 #endif
829