1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, 2008, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine HTTP Handler
46 //
47 // Douglas Thrift
48 //
49 // $Id: HttpHandler.cpp 372 2008-08-23 11:00:12Z douglas $
50
51 #include "HttpHandler.hpp"
52
53 // Lovely C Sockets!
54 #ifndef _WIN32
55 // BSD Sockets
56 #include <unistd.h>
57 #include <sys/types.h>
58 #include <sys/socket.h>
59 #include <netinet/in.h>
60 #include <netdb.h>
61
closesocket(SOCKET s)62 inline int closesocket(SOCKET s) { return close(s); }
63 #endif
64
65 #ifndef _OpenSSL_
HttpHandler()66 HttpHandler::HttpHandler() : binary(false), length(0), chunked(false)
67 #else
68 HttpHandler::HttpHandler() : binary(false), length(0), chunked(false),
69 tls(false)
70 #endif
71 {
72 #ifdef _WIN32
73 if (WSAStartup(MAKEWORD(2, 0), &data) != 0)
74 {
75 error(program + ": WSAStartup");
76 exit(1);
77 }
78 #endif // _WIN32
79 }
80
~HttpHandler()81 HttpHandler::~HttpHandler()
82 {
83 #ifdef _WIN32
84 WSACleanup();
85 #endif // _WIN32
86 }
87
handle(URL & url,const string & referer,bool head)88 bool HttpHandler::handle(URL &url, const string& referer, bool head)
89 {
90 bool answer(false);
91
92 if ((http = socket(PF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET)
93 {
94 error(program + ": Socket");
95 exit(1);
96 }
97
98 sockaddr_in address;
99 hostent* host;
100
101 address.sin_family = AF_INET;
102
103 if ((host = gethostbyname(url.getAddress().c_str())) == NULL)
104 {
105 error(program + ": Host: " + url.getAddress(), true);
106
107 return answer;
108 }
109
110 address.sin_addr = *((in_addr*)*host->h_addr_list);
111 address.sin_port = htons(url.getPort());
112
113 if (connect(http, (sockaddr*)&address, sizeof(sockaddr_in)) ==
114 SOCKET_ERROR)
115 {
116 error(program + ": Connect");
117
118 return answer;
119 }
120
121 #ifdef _OpenSSL_
122 if (url.getTls())
123 {
124 tls = true;
125
126 if (!starttls()) return answer;
127 }
128 #endif
129
130 if (head)
131 {
132 putline("HEAD " + url.getPath() + " HTTP/1.1");
133 }
134 else
135 {
136 putline("GET " + url.getPath() + " HTTP/1.1");
137 }
138
139 putline("Accept: text/html; text/plain");
140 #ifndef _OpenSSL_
141 putline("User-Agent: " + agent(true) + ' ' + platform());
142
143 if (url.getPort() == 80)
144 #else
145 putline("User-Agent: " + agent(true) + ' ' + platform() + ' '
146 + openssl(true));
147
148 if (url.getPort() == 80 && tls || url.getPort() == 443 && tls)
149 #endif
150 {
151 putline("Host: " + url.getAddress());
152 }
153 else
154 {
155 ostringstream port;
156
157 port << url.getPort();
158
159 putline("Host: " + url.getAddress() + ':' + port.str());
160 }
161
162 if (!referer.empty())
163 {
164 putline("Referer: " + referer);
165 }
166
167 putline("Connection: close");
168 putline();
169
170 Code response;
171 string line;
172
173 do
174 {
175 line = getline();
176
177 if (line.find("HTTP/") != 0) return answer;
178
179 size_t dot(line.find('.')), space(line.find(' ')), major, minor;
180 istringstream number(line.substr(5, dot - 5) + " " + line.substr(dot
181 + 1, space - dot - 1));
182
183 number >> major;
184 number >> minor;
185
186 if (major > 1)
187 {
188 cerr << program << ": Potentially Incompatible Server: HTTP/" <<
189 major << "." << minor << "\n";
190
191 return answer;
192 }
193
194 number.clear();
195 number.str(line.substr(space + 1, 3));
196
197 number >> response;
198
199 if (response < ok) do line = getline(); while (!line.empty());
200 }
201 while (response < ok);
202
203 do
204 {
205 line = getline();
206
207 if (!line.empty())
208 {
209 size_t colon(line.find(':'));
210 string field(line.substr(0, colon)), value(line.substr(colon + 1));
211
212 while (isspace(value[0])) value.erase(0, 1);
213
214 if (field == "Content-Type")
215 {
216 type = value;
217 }
218 else if (field == "Content-Length")
219 {
220 istringstream number(value);
221
222 number >> length;
223 }
224 else if (field == "Location")
225 {
226 location = value;
227 }
228 else if (field == "Transfer-Encoding")
229 {
230 chunked = value == "chunked";
231 }
232 }
233 }
234 while (!line.empty());
235
236 switch (response)
237 {
238 case ok:
239 if (debug) cerr << "response = " << response << "\n";
240
241 answer = true;
242 break;
243 case choices:
244 case moved:
245 case found:
246 if (debug) cerr << "response = " << response << "\n"
247 << "location = " << location << "\n";
248
249 location = getLink(location, url);
250 break;
251 case notfound:
252 case internal:
253 if (debug) cerr << "response = " << response << "\n";
254 break;
255 default:
256 if (debug) cerr << "response = " << response << "\n";
257
258 if (response <= 299) answer = true; else if (response <= 399)
259 {
260 location = getLink(location, url);
261 }
262 break;
263 }
264
265 if (!head && answer) populate();
266
267 return answer;
268 }
269
clear()270 void HttpHandler::clear()
271 {
272 #ifdef _OpenSSL_
273 if (tls)
274 {
275 SSL_shutdown(ssl);
276 SSL_free(ssl);
277 SSL_CTX_free(ctx);
278 }
279 #endif
280
281 closesocket(http);
282
283 length = 0;
284
285 type.erase();
286 location.erase();
287 page.clear();
288 page.str("");
289
290 chunked = false;
291 #ifdef _OpenSSL_
292 tls = false;
293 #endif
294 }
295
populate()296 void HttpHandler::populate()
297 {
298 if (!chunked)
299 {
300 size_t left(length);
301
302 while (left > 0)
303 {
304 memset(buffer, 0, BUFSIZ + 1);
305
306 size_t bytes(left > BUFSIZ ? BUFSIZ : left);
307 long received;
308
309 while (true)
310 {
311 #ifndef _OpenSSL_
312 if ((received = recv(http, buffer, bytes, 0)) == SOCKET_ERROR)
313 {
314 error(program + ": Recv");
315 exit(1);
316 }
317 #else
318 if ((received = !tls ? recv(http, buffer, bytes, 0) :
319 SSL_read(ssl, buffer, bytes)) <= 0)
320 {
321 !tls ? error(program + ": Recv") : error(program +
322 ": SSL Read", int(received));
323 }
324 #endif
325 else if (received != bytes)
326 {
327 left -= received;
328
329 page << buffer;
330
331 memset(buffer, 0, BUFSIZ + 1);
332
333 bytes -= received;
334 }
335 else break;
336 }
337
338 page << buffer;
339
340 left -= bytes;
341 }
342 }
343 else
344 {
345 size_t chunk;
346
347 do
348 {
349 istringstream number(getline());
350
351 number.setf(ios_base::hex, ios_base::basefield);
352
353 number >> chunk;
354
355 size_t left(chunk);
356
357 while (left > 0)
358 {
359 memset(buffer, 0, BUFSIZ + 1);
360
361 size_t bytes(left > BUFSIZ ? BUFSIZ : left);
362 long received;
363
364 while (true)
365 {
366 #ifndef _OpenSSL_
367 if ((received = recv(http, buffer, bytes, 0)) ==
368 SOCKET_ERROR)
369 {
370 error(program + ": Recv");
371 exit(1);
372 }
373 #else
374 if ((received = !tls ? recv(http, buffer, bytes, 0) :
375 SSL_read(ssl, buffer, bytes)) <= 0)
376 {
377 !tls ? error(program + ": Recv") : error(program +
378 ": SSL Read", int(received));
379
380 exit(1);
381 }
382 #endif
383 else if (received != bytes)
384 {
385 left -= received;
386 page << buffer;
387
388 memset(buffer, 0, BUFSIZ + 1);
389
390 bytes -= received;
391 }
392 else break;
393 }
394
395 page << buffer;
396
397 left -= bytes;
398 }
399
400 getline();
401
402 length += chunk;
403 }
404 while (chunk > 0);
405 }
406
407 if (!binary)
408 {
409 string page(this->page.str());
410
411 for (size_t index(0); index < page.length(); index++)
412 {
413 if (page[index] == '\r' && (index + 1 < page.length()) ?
414 page[index + 1] == '\n' : false)
415 {
416 page.erase(index, 1);
417 }
418 else if (page[index] == '\r')
419 {
420 page[index] = '\n';
421 }
422 }
423
424 this->page.str(page);
425 }
426 }
427
putline(const string & line)428 void HttpHandler::putline(const string& line)
429 {
430 snprintf(buffer, BUFSIZ + 1, "%s\r\n", line.c_str());
431
432 #ifndef _OpenSSL_
433 if (send(http, buffer, strlen(buffer), 0) == SOCKET_ERROR)
434 {
435 error(program + ": Send");
436 exit(1);
437 }
438 #else
439 if (!tls)
440 {
441 if (send(http, buffer, strlen(buffer), 0) == SOCKET_ERROR)
442 {
443 error(program + ": Send");
444 exit(1);
445 }
446 }
447 else
448 {
449 int number;
450
451 if ((number = SSL_write(ssl, buffer, strlen(buffer))) <= 0)
452 {
453 error(program + ": SSL Write", number);
454 exit(1);
455 }
456 }
457 #endif
458 }
459
getline()460 string HttpHandler::getline()
461 {
462 string line;
463 char byte;
464
465 do
466 {
467 #ifndef _OpenSSL_
468 if (recv(http, &byte, 1, 0) == SOCKET_ERROR)
469 {
470 error(program + ": Recv");
471 }
472 #else
473 if (!tls)
474 {
475 if (recv(http, &byte, 1, 0) == SOCKET_ERROR)
476 {
477 error(program + ": Recv");
478 }
479 }
480 else
481 {
482 int number;
483
484 if ((number = SSL_read(ssl, &byte, 1)) <= 0)
485 {
486 error(program + ": SSL Read", number);
487 }
488 }
489 #endif
490
491 if (byte != '\r' && byte != '\n')
492 {
493 line += byte;
494 }
495 }
496 while (byte != '\n');
497
498 return line;
499 }
500
error(const string & prefix,bool host)501 void HttpHandler::error(const string& prefix, bool host)
502 {
503 #ifdef _WIN32
504 string error;
505
506 switch (WSAGetLastError())
507 {
508 case WSAEACCES:
509 error = "Permission denied";
510 break;
511 case WSAEADDRINUSE:
512 error = "Address already in use";
513 break;
514 case WSAEADDRNOTAVAIL:
515 error = "Cannot assign requested address";
516 break;
517 case WSAEAFNOSUPPORT:
518 error = "Address family not supported by protocol family";
519 break;
520 case WSAEALREADY:
521 error = "Operation already in progress";
522 break;
523 case WSAECONNABORTED:
524 error = "Software caused connection abort";
525 break;
526 case WSAECONNREFUSED:
527 error = "Connection refused";
528 break;
529 case WSAECONNRESET:
530 error = "Connection reset by peer";
531 break;
532 case WSAEDESTADDRREQ:
533 error = "Destination address required";
534 break;
535 case WSAEFAULT:
536 error = "Bad address";
537 break;
538 case WSAEHOSTDOWN:
539 error = "Host is down";
540 break;
541 case WSAEHOSTUNREACH:
542 error = "No route to host";
543 break;
544 case WSAEINPROGRESS:
545 error = "Operation now in progress";
546 break;
547 case WSAEINTR:
548 error = "Interrupted function call";
549 break;
550 case WSAEINVAL:
551 error = "Invalid argument";
552 break;
553 case WSAEISCONN:
554 error = "Socket is already connected";
555 break;
556 case WSAEMFILE:
557 error = "Too many open files";
558 break;
559 case WSAEMSGSIZE:
560 error = "Message too long";
561 break;
562 case WSAENETDOWN:
563 error = "Network is down";
564 break;
565 case WSAENETRESET:
566 error = "Network dropped connection on reset";
567 break;
568 case WSAENETUNREACH:
569 error = "Network is unreachable";
570 break;
571 case WSAENOBUFS:
572 error = "No buffer space available";
573 break;
574 case WSAENOPROTOOPT:
575 error = "Bad protocol option";
576 break;
577 case WSAENOTCONN:
578 error = "Socket is not connected";
579 break;
580 case WSAENOTSOCK:
581 error = "Socket operation on non-socket";
582 break;
583 case WSAEOPNOTSUPP:
584 error = "Operation not supported";
585 break;
586 case WSAEPFNOSUPPORT:
587 error = "Protocol family not supported";
588 break;
589 case WSAEPROCLIM:
590 error = "Too many processes";
591 break;
592 case WSAEPROTONOSUPPORT:
593 error = "Protocol not supported";
594 break;
595 case WSAEPROTOTYPE:
596 error = "Protocol wrong type for socket";
597 break;
598 case WSAESHUTDOWN:
599 error = "Cannot send after socket shutdown";
600 break;
601 case WSAESOCKTNOSUPPORT:
602 error = "Socket type not supported";
603 break;
604 case WSAETIMEDOUT:
605 error = "Connection timed out";
606 break;
607 case WSATYPE_NOT_FOUND:
608 error = "Class type not found";
609 break;
610 case WSAEWOULDBLOCK:
611 error = "Resource temporarily unavailable";
612 break;
613 case WSAHOST_NOT_FOUND:
614 error = "Host not found";
615 break;
616 case WSA_INVALID_HANDLE:
617 error = "Specified event object handle is invalid";
618 break;
619 case WSA_INVALID_PARAMETER:
620 error = "One or more parameters are invalid";
621 break;
622 // case WSAINVALIDPROCTABLE:
623 // error = "Invalid procedure table from service provider";
624 // break;
625 // case WSAINVALIDPROVIDER:
626 // error = "Invalid service provider version number";
627 // break;
628 case WSA_IO_INCOMPLETE:
629 error = "Overlapped I/O event object not in signaled state";
630 break;
631 case WSA_IO_PENDING:
632 error = "Overlapped operations will complete later";
633 break;
634 case WSA_NOT_ENOUGH_MEMORY:
635 error = "Insufficient memory available";
636 break;
637 case WSANOTINITIALISED:
638 error = "Successful WSAStartup not yet performed";
639 break;
640 case WSANO_DATA:
641 error = "Valid name, no data record of requested type";
642 break;
643 case WSANO_RECOVERY:
644 error = "This is a non-recoverable error";
645 break;
646 // case WSAPROVIDERFAILEDINIT:
647 // error = "Unable to initialize a service provider";
648 // break;
649 case WSASYSCALLFAILURE:
650 error = "System call failure";
651 break;
652 case WSASYSNOTREADY:
653 error = "Network subsystem is unavailable";
654 break;
655 case WSATRY_AGAIN:
656 error = "Non-authoritative host not found";
657 break;
658 case WSAVERNOTSUPPORTED:
659 error = "WINSOCK.DLL version out of range";
660 break;
661 case WSAEDISCON:
662 error = "Graceful shutdown in progress";
663 break;
664 case WSA_OPERATION_ABORTED:
665 error = "Overlapped operation aborted";
666 break;
667 default:
668 error = "Unknown error";
669 break;
670 }
671
672 cerr << prefix << ": " << error << "\n";
673 #else
674 if (host)
675 {
676 string error;
677
678 switch (h_errno)
679 {
680 case HOST_NOT_FOUND:
681 error = "Unknown host";
682 break;
683 case TRY_AGAIN:
684 error = "Host name lookup failure";
685 break;
686 case NO_RECOVERY:
687 error = "Unknown server error";
688 break;
689 case NO_DATA:
690 error = "No address associated with name";
691 break;
692 default:
693 error = "Unknown error";
694 break;
695 }
696
697 cerr << prefix << ": " << error << "\n";
698 }
699 else
700 {
701 perror(prefix.c_str());
702 }
703 #endif // _WIN32
704 }
705
706 #ifdef _OpenSSL_
error(const string & prefix,int number)707 void HttpHandler::error(const string& prefix, int number)
708 {
709 string error;
710
711 switch (SSL_get_error(ssl, number))
712 {
713 case SSL_ERROR_NONE:
714 error = "The TLS/SSL I/O operation completed";
715 break;
716 case SSL_ERROR_ZERO_RETURN:
717 error = "The TLS/SSL connection has been closed";
718 break;
719 case SSL_ERROR_WANT_READ:
720 case SSL_ERROR_WANT_WRITE:
721 case SSL_ERROR_WANT_CONNECT:
722 // case SSL_ERROR_WANT_ACCEPT:
723 case SSL_ERROR_WANT_X509_LOOKUP:
724 error = "The operation did not complete";
725 break;
726 case SSL_ERROR_SYSCALL:
727 if (int err = ERR_get_error() != 0)
728 {
729 error = ERR_reason_error_string(err);
730 }
731 else
732 {
733 switch (number)
734 {
735 case 0:
736 error = "An EOF was observed that violates the protocol";
737 break;
738 case -1:
739 this->error(prefix);
740 return;
741 default:
742 error = "Unknown error";
743 break;
744 }
745 }
746 break;
747 case SSL_ERROR_SSL:
748 error = ERR_reason_error_string(ERR_get_error());
749 break;
750 default:
751 error = "Unknown error";
752 break;
753 }
754
755 cerr << prefix << ": " << error << "\n";
756 }
757
starttls()758 bool HttpHandler::starttls()
759 {
760 SSL_load_error_strings();
761 SSL_library_init();
762
763 #ifndef _urandomdev_
764 int pid(getpid()), now(time(NULL));
765 size_t seed(now > pid ? now - pid : pid - now);
766 char* junk = new char[seed % 30 + 2];
767
768 junk[0] = pid;
769 junk[seed % 30 + 1] = now;
770
771 srand(seed);
772
773 for (size_t index = 1; index < seed % 30 + 1; index++)
774 {
775 junk[index] = rand();
776 }
777
778 if (debug)
779 {
780 cerr << "junk = {\n";
781
782 for (size_t index = 1; index < seed % 30 + 2; index++)
783 {
784 cerr << " [" << index << "] = " << size_t(junk[index]) << "\n";
785 }
786
787 cerr << "}\n";
788 }
789
790 RAND_seed(junk, seed % 30 + 2);
791
792 delete junk;
793 #else
794 if (debug) cerr << "junk = /dev/urandom\n";
795 #endif
796
797 ctx = SSL_CTX_new(TLSv1_client_method());
798
799 if (ctx == NULL)
800 {
801 cerr << program << ": SSL CTX New: "
802 << ERR_reason_error_string(ERR_get_error()) << "\n";
803
804 return false;
805 }
806
807 ssl = SSL_new(ctx);
808
809 if (SSL_set_fd(ssl, http) == 0)
810 {
811 cerr << program << ": SSL Set FD: "
812 << ERR_reason_error_string(ERR_get_error()) << "\n";
813
814 return false;
815 }
816
817 int number;
818
819 if ((number = SSL_connect(ssl)) <= 0)
820 {
821 error(program + ": SSL Connect", number);
822
823 return false;
824 }
825
826 return true;
827 }
828 #endif
829